1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, hls, hds)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height,
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190 * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220 average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299 form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _GEO_BYPASS attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with geo_bypass_country. (experimental)
331 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
332 countries for this extractor. One of these countries will be used by
333 geo restriction bypass mechanism right away in order to bypass
334 geo restriction, of course, if the mechanism is not disabled. (experimental)
336 NB: both these geo attributes are experimental and may change in future
337 or be completely removed.
339 Finally, the _WORKING attribute should be set to False for broken IEs
340 in order to warn the users and skip the tests.
345 _x_forwarded_for_ip = None
347 _GEO_COUNTRIES = None
350 def __init__(self, downloader=None):
351 """Constructor. Receives an optional downloader."""
353 self._x_forwarded_for_ip = None
354 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # Deliberately avoid hasattr/getattr: we need to know whether the
    # compiled pattern is cached on *this* class, whereas getattr would
    # also find one cached on a superclass.
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached.match(url) is not None
368 def _match_id(cls, url):
369 if '_VALID_URL_RE' not in cls.__dict__:
370 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
371 m = cls._VALID_URL_RE.match(url)
377 """Getter method for _WORKING."""
380 def initialize(self):
381 """Initializes an instance (authentication, etc)."""
382 self._initialize_geo_bypass(self._GEO_COUNTRIES)
384 self._real_initialize()
387 def _initialize_geo_bypass(self, countries):
389 Initialize geo restriction bypass mechanism.
391 This method is used to initialize geo bypass mechanism based on faking
392 X-Forwarded-For HTTP header. A random country from provided country list
393 is selected and a random IP belonging to this country is generated. This
394 IP will be passed as X-Forwarded-For HTTP header in all subsequent
397 This method will be used for initial geo bypass mechanism initialization
398 during the instance initialization with _GEO_COUNTRIES.
400 You may also manually call it from extractor's code if geo countries
401 information is not available beforehand (e.g. obtained during
402 extraction) or due to some another reason.
404 if not self._x_forwarded_for_ip:
405 country_code = self._downloader.params.get('geo_bypass_country', None)
406 # If there is no explicit country for geo bypass specified and
407 # the extractor is known to be geo restricted let's fake IP
408 # as X-Forwarded-For right away.
409 if (not country_code and
411 self._downloader.params.get('geo_bypass', True) and
413 country_code = random.choice(countries)
415 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
416 if self._downloader.params.get('verbose', False):
417 self._downloader.to_stdout(
418 '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
420 def extract(self, url):
421 """Extracts URL information and returns it in list of dicts."""
426 ie_result = self._real_extract(url)
427 if self._x_forwarded_for_ip:
428 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
430 except GeoRestrictedError as e:
431 if self.__maybe_fake_ip_and_retry(e.countries):
434 except ExtractorError:
436 except compat_http_client.IncompleteRead as e:
437 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
438 except (KeyError, StopIteration) as e:
439 raise ExtractorError('An extractor error has occurred.', cause=e)
441 def __maybe_fake_ip_and_retry(self, countries):
442 if (not self._downloader.params.get('geo_bypass_country', None) and
444 self._downloader.params.get('geo_bypass', True) and
445 not self._x_forwarded_for_ip and
447 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
448 if self._x_forwarded_for_ip:
450 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored for use by the reporting/downloading helpers on this class.
    self._downloader = downloader
458 def _real_initialize(self):
459 """Real initialization process. Redefine in subclasses."""
462 def _real_extract(self, url):
463 """Real extraction process. Redefine in subclasses."""
468 """A string for getting the InfoExtractor with get_info_extractor"""
469 return compat_str(cls.__name__[:-2])
473 return compat_str(type(self).__name__[:-2])
475 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
476 """ Returns the response handle """
478 self.report_download_webpage(video_id)
479 elif note is not False:
481 self.to_screen('%s' % (note,))
483 self.to_screen('%s: %s' % (video_id, note))
484 if isinstance(url_or_request, compat_urllib_request.Request):
485 url_or_request = update_Request(
486 url_or_request, data=data, headers=headers, query=query)
489 url_or_request = update_url_query(url_or_request, query)
490 if data is not None or headers:
491 url_or_request = sanitized_Request(url_or_request, data, headers)
493 return self._downloader.urlopen(url_or_request)
494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 errnote = 'Unable to download webpage'
500 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
502 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
504 self._downloader.report_warning(errmsg)
507 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
508 """ Returns a tuple (page content as string, URL handle) """
509 # Strip hashes from the URL (#1038)
510 if isinstance(url_or_request, (compat_str, str)):
511 url_or_request = url_or_request.partition('#')[0]
513 # Some sites check X-Forwarded-For HTTP header in order to figure out
514 # the origin of the client behind proxy. This allows bypassing geo
515 # restriction by faking this header's value to IP that belongs to some
516 # geo unrestricted country. We will do so once we encounter any
517 # geo restriction error.
518 if self._x_forwarded_for_ip:
519 if 'X-Forwarded-For' not in headers:
520 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
522 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
526 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
527 return (content, urlh)
530 def _guess_encoding_from_content(content_type, webpage_bytes):
531 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
533 encoding = m.group(1)
535 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
536 webpage_bytes[:1024])
538 encoding = m.group(1).decode('ascii')
539 elif webpage_bytes.startswith(b'\xff\xfe'):
546 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
547 content_type = urlh.headers.get('Content-Type', '')
548 webpage_bytes = urlh.read()
549 if prefix is not None:
550 webpage_bytes = prefix + webpage_bytes
552 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
553 if self._downloader.params.get('dump_intermediate_pages', False):
555 url = url_or_request.get_full_url()
556 except AttributeError:
558 self.to_screen('Dumping request to ' + url)
559 dump = base64.b64encode(webpage_bytes).decode('ascii')
560 self._downloader.to_screen(dump)
561 if self._downloader.params.get('write_pages', False):
563 url = url_or_request.get_full_url()
564 except AttributeError:
566 basen = '%s_%s' % (video_id, url)
568 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
569 basen = basen[:240 - len(h)] + h
570 raw_filename = basen + '.dump'
571 filename = sanitize_filename(raw_filename, restricted=True)
572 self.to_screen('Saving request to ' + filename)
573 # Working around MAX_PATH limitation on Windows (see
574 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
575 if compat_os_name == 'nt':
576 absfilepath = os.path.abspath(filename)
577 if len(absfilepath) > 259:
578 filename = '\\\\?\\' + absfilepath
579 with open(filename, 'wb') as outf:
580 outf.write(webpage_bytes)
583 content = webpage_bytes.decode(encoding, 'replace')
585 content = webpage_bytes.decode('utf-8', 'replace')
587 if ('<title>Access to this site is blocked</title>' in content and
588 'Websense' in content[:512]):
589 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
590 blocked_iframe = self._html_search_regex(
591 r'<iframe src="([^"]+)"', content,
592 'Websense information URL', default=None)
594 msg += ' Visit %s for more details' % blocked_iframe
595 raise ExtractorError(msg, expected=True)
596 if '<title>The URL you requested has been blocked</title>' in content[:512]:
598 'Access to this webpage has been blocked by Indian censorship. '
599 'Use a VPN or proxy server (with --proxy) to route around it.')
600 block_msg = self._html_search_regex(
601 r'</h1><p>(.*?)</p>',
602 content, 'block message', default=None)
604 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
605 raise ExtractorError(msg, expected=True)
609 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
610 """ Returns the data of the page as a string """
613 while success is False:
615 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
617 except compat_http_client.IncompleteRead as e:
619 if try_count >= tries:
621 self._sleep(timeout, video_id)
628 def _download_xml(self, url_or_request, video_id,
629 note='Downloading XML', errnote='Unable to download XML',
630 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
631 """Return the xml as an xml.etree.ElementTree.Element"""
632 xml_string = self._download_webpage(
633 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
634 if xml_string is False:
637 xml_string = transform_source(xml_string)
638 return compat_etree_fromstring(xml_string.encode('utf-8'))
640 def _download_json(self, url_or_request, video_id,
641 note='Downloading JSON metadata',
642 errnote='Unable to download JSON metadata',
643 transform_source=None,
644 fatal=True, encoding=None, data=None, headers={}, query={}):
645 json_string = self._download_webpage(
646 url_or_request, video_id, note, errnote, fatal=fatal,
647 encoding=encoding, data=data, headers=headers, query=query)
648 if (not fatal) and json_string is False:
650 return self._parse_json(
651 json_string, video_id, transform_source=transform_source, fatal=fatal)
653 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
655 json_string = transform_source(json_string)
657 return json.loads(json_string)
658 except ValueError as ve:
659 errmsg = '%s: Failed to parse JSON ' % video_id
661 raise ExtractorError(errmsg, cause=ve)
663 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward *msg* to the downloader as a warning, tagged with the IE
    name and, when supplied, the video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for *video_id* is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen('Confirming age')
def report_login(self):
    """Announce an attempt to log in."""
    self.to_screen('Logging in')
691 def raise_login_required(msg='This video is only available for registered users'):
692 raise ExtractorError(
693 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
    """Abort extraction with a GeoRestrictedError, carrying the list of
    countries the content is presumed available from (if known)."""
    raise GeoRestrictedError(msg, countries=countries)
700 # Methods for following #608
702 def url_result(url, ie=None, video_id=None, video_title=None):
703 """Returns a URL that points to a page that should be processed"""
704 # TODO: ie should be the class used for getting the info
705 video_info = {'_type': 'url',
708 if video_id is not None:
709 video_info['id'] = video_id
710 if video_title is not None:
711 video_info['title'] = video_title
715 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
716 """Returns a playlist"""
717 video_info = {'_type': 'playlist',
720 video_info['id'] = playlist_id
722 video_info['title'] = playlist_title
723 if playlist_description:
724 video_info['description'] = playlist_description
727 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
729 Perform a regex search on the given string, using a single or a list of
730 patterns returning the first matching group.
731 In case of failure return a default value or raise a WARNING or a
732 RegexNotFoundError, depending on fatal, specifying the field name.
734 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
735 mobj = re.search(pattern, string, flags)
738 mobj = re.search(p, string, flags)
742 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
743 _name = '\033[0;34m%s\033[0m' % name
749 # return the first matching group
750 return next(g for g in mobj.groups() if g is not None)
752 return mobj.group(group)
753 elif default is not NO_DEFAULT:
756 raise RegexNotFoundError('Unable to extract %s' % _name)
758 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
761 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
763 Like _search_regex, but strips HTML tags and unescapes entities.
765 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
767 return clean_html(res).strip()
771 def _get_netrc_login_info(self, netrc_machine=None):
774 netrc_machine = netrc_machine or self._NETRC_MACHINE
776 if self._downloader.params.get('usenetrc', False):
778 info = netrc.netrc().authenticators(netrc_machine)
783 raise netrc.NetrcParseError(
784 'No authenticators for %s' % netrc_machine)
785 except (IOError, netrc.NetrcParseError) as err:
786 self._downloader.report_warning(
787 'parsing .netrc: %s' % error_to_compat_str(err))
789 return username, password
791 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
793 Get the login info as (username, password)
794 First look for the manually specified credentials using username_option
795 and password_option as keys in params dictionary. If no such credentials
796 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
798 If there's no info available, return (None, None)
800 if self._downloader is None:
803 downloader_params = self._downloader.params
805 # Attempt to use provided username and password or .netrc data
806 if downloader_params.get(username_option) is not None:
807 username = downloader_params[username_option]
808 password = downloader_params[password_option]
810 username, password = self._get_netrc_login_info(netrc_machine)
812 return username, password
814 def _get_tfa_info(self, note='two-factor verification code'):
816 Get the two-factor authentication info
817 TODO - asking the user will be required for sms/phone verify
818 currently just uses the command line option
819 If there's no info available, return None
821 if self._downloader is None:
823 downloader_params = self._downloader.params
825 if downloader_params.get('twofactor') is not None:
826 return downloader_params['twofactor']
828 return compat_getpass('Type %s and press [Return]: ' % note)
830 # Helper functions for extracting OpenGraph info
832 def _og_regexes(prop):
833 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
834 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
835 % {'prop': re.escape(prop)})
836 template = r'<meta[^>]+?%s[^>]+?%s'
838 template % (property_re, content_re),
839 template % (content_re, property_re),
843 def _meta_regex(prop):
844 return r'''(?isx)<meta
845 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
846 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
848 def _og_search_property(self, prop, html, name=None, **kargs):
849 if not isinstance(prop, (list, tuple)):
852 name = 'OpenGraph %s' % prop[0]
855 og_regexes.extend(self._og_regexes(p))
856 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
859 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract the OpenGraph image URL from *html* (non-fatal)."""
    return self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Extract the OpenGraph description from *html* (non-fatal)."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Extract the OpenGraph title from *html*."""
    return self._og_search_property('title', html, **kargs)
870 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
871 regexes = self._og_regexes('video') + self._og_regexes('video:url')
873 regexes = self._og_regexes('video:secure_url') + regexes
874 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Extract the OpenGraph canonical URL from *html*."""
    return self._og_search_property('url', html, **kargs)
879 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
880 if not isinstance(name, (list, tuple)):
882 if display_name is None:
883 display_name = name[0]
884 return self._html_search_regex(
885 [self._meta_regex(n) for n in name],
886 html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader from the Dublin Core 'dc.creator' meta tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
891 def _rta_search(self, html):
892 # See http://www.rtalabel.org/index.php?content=howtofaq#single
893 if re.search(r'(?ix)<meta\s+name="rating"\s+'
894 r' content="RTA-5042-1996-1400-1577-RTA"',
899 def _media_rating_search(self, html):
900 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
901 rating = self._html_search_meta('rating', html)
913 return RATING_TABLE.get(rating.lower())
915 def _family_friendly_search(self, html):
916 # See http://schema.org/VideoObject
917 family_friendly = self._html_search_meta('isFamilyFriendly', html)
919 if not family_friendly:
928 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the player URL from the 'twitter:player' card meta tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
934 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
935 json_ld = self._search_regex(
936 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
937 html, 'JSON-LD', group='json_ld', **kwargs)
938 default = kwargs.get('default', NO_DEFAULT)
940 return default if default is not NO_DEFAULT else {}
941 # JSON-LD may be malformed and thus `fatal` should be respected.
942 # At the same time `default` may be passed that assumes `fatal=False`
943 # for _search_regex. Let's simulate the same behavior here as well.
944 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
945 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract metadata from JSON-LD data (raw string or parsed structure).

        Understands the schema.org TVEpisode, Article and VideoObject types;
        returns a dict containing only the fields that were present.
        """
        # A raw string is parsed first; anything else is used as-is.
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not isinstance(json_ld, (list, tuple, dict)):
        if isinstance(json_ld, dict):
            # Only schema.org contexts are understood.
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                if item_type == 'TVEpisode':
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    # Both spellings of the series reference occur in the wild.
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                elif item_type == 'VideoObject':
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
        # Drop fields that could not be extracted.
        return dict((k, v) for k, v in info.items() if v is not None)
    def _hidden_inputs(html):
        """Collect name/value pairs of hidden (and submit) <input> fields."""
        # Strip HTML comments first so commented-out inputs are ignored.
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        # NOTE: `input` shadows the builtin of the same name (loop-local only).
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if attrs.get('type') not in ('hidden', 'submit'):
            # Fall back to `id` when `name` is absent.
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs
1012 def _form_hidden_inputs(self, form_id, html):
1013 form = self._search_regex(
1014 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1015 html, '%s form' % form_id, group='form')
1016 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort `formats` in place from worst to best quality.

        field_preference: optional list/tuple of format-dict keys that takes
        precedence over the built-in heuristic ordering.
        Raises ExtractorError when no formats are available.
        """
            raise ExtractorError('No video formats found')

            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Builds the tuple used as the sort key for one format dict.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied field order wins over the heuristic below.
            if isinstance(field_preference, (list, tuple)):
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            # HTTP(S) preferred over RTSP and other protocols.
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if f.get('acodec') == 'none':  # video only
                    if self._downloader.params.get('prefer_free_formats'):
                        ORDER = ['flv', 'mp4', 'webm']
                        ORDER = ['webm', 'flv', 'mp4']
                        ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

                # Missing numeric fields sort as -1, missing format_id as ''.
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Filter out (in place) formats whose URLs fail a quick availability check."""
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    # Use format_id in the progress message when available.
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        """Drop (in place) formats with duplicate URLs, keeping the first occurrence."""
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video', headers={}):
        """Check that an HTTP(S) URL responds; non-HTTP(S) URLs pass unchecked.

        NOTE(review): `headers={}` is a mutable default; it is only forwarded
        here, not mutated, so it is harmless — a None default would be safer.
        """
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
        except ExtractorError as e:
            # Only network-level failures mark the URL invalid.
            if isinstance(e.cause, compat_urllib_error.URLError):
                '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
            # Honors the prefer_insecure (--prefer-insecure) option.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Prefix a protocol-relative URL (//host/...) with a scheme."""
        if url.startswith('//'):
            # Default to the user-preferred scheme when none is given.
            scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Sleep for `timeout` seconds, printing a formatted message first.

        msg_template may reference %(video_id)s and %(timeout)s.
        """
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        """Download an f4m (HDS) manifest and parse it into a list of format dicts."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,

        # _download_xml returns False on non-fatal failure.
        if manifest is False:

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest XML element into a list of format dicts.

        Recurses into referenced sub-manifests (set-level -> stream-level) and
        delegates .m3u8 references to the HLS parser.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':

        # Probe the 1.0 namespace first, then fall back to 2.0.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            # NOTE(review): display name 'base URL' below looks copy-pasted
            # from the baseURL lookup above; should probably read 'mime type'.
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                    if media_url is None:
                        media_url = media_el.attrib.get('url')
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                    formats.extend(f4m_formats)
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # flv only when a bootstrapInfo (stream-level manifest) exists.
                'ext': 'flv' if bootstrap_info is not None else None,
                'preference': preference,
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        """Build a low-priority 'meta' format pointing at the m3u8 playlist itself."""
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # Heavily penalized so real variant formats always win.
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download and parse an HLS (m3u8) playlist into a list of format dicts.

        Master playlists are expanded into their variant streams; media
        playlists are returned as a single format.
        """
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Use the final (post-redirect) URL as the base for relative URIs.
        m3u8_url = urlh.geturl()

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        # Resolve playlist-relative URIs against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
                'format_id': m3u8_id,
                'protocol': entry_protocol,
                'preference': preference,
        # Tracks, per GROUP-ID, whether audio is muxed into the video streams.
        audio_in_video_stream = {}
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attributes apply to the next (URI) line.
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    group_id = media.get('GROUP-ID')
                    media_url = media.get('URI')
                        for v in (group_id, media.get('NAME')):
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'protocol': entry_protocol,
                            'preference': preference,
                        if media_type == 'AUDIO':
                            f['vcodec'] = 'none'
                            if group_id and not audio_in_video_stream.get(group_id):
                                audio_in_video_stream[group_id] = False
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
                        if media_type == 'AUDIO' and group_id:
                            audio_in_video_stream[group_id] = True
            elif line.startswith('#') or not line.strip():
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                    format_id.append(m3u8_id)
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': manifest_url,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                resolution = last_info.get('RESOLUTION')
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                f.update(parse_codecs(last_info.get('CODECS')))
                # Variant references an external audio group -> video-only.
                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
                    # TODO: update acodec for audio only formats with the same GROUP-ID
                    f['acodec'] = 'none'
    def _xpath_ns(path, namespace=None):
        """Qualify each component of an XPath with an XML namespace."""
        for c in path.split('/'):
            # '.' and empty components are kept un-namespaced.
            if not c or c == '.':
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        """Download a SMIL document and parse only its formats."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL document and parse it into a full info dict."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1443 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1444 return self._download_xml(
1445 smil_url, video_id, 'Downloading SMIL file',
1446 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (formats, subtitles, metadata) from a SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # Derive an id from the SMIL file name.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Pull title/description/date from <head><meta> entries.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1488 def _parse_smil_namespace(self, smil):
1489 return self._search_regex(
1490 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from SMIL <video>/<audio> elements.

        Handles RTMP, HLS (m3u8), HDS (f4m) and plain HTTP sources;
        transform_rtmp_url, when given, rewrites (streamer, src) pairs.
        """
        # Base URL may come from <head><meta base=...> / httpBase.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')

        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip empty and already-seen sources.
            if not src or src in srcs:

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single entry means a media playlist; tag it with the bitrate.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)

            if src_ext == 'f4m':
                        'plugin': 'flowplayer-3.2.0.1',
                    f4m_url += '&' if '?' in f4m_url else '?'
                    f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect subtitle tracks from SMIL <textstream> elements, keyed by language."""
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Skip empty and already-seen sources.
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            # Fall back to subtitles_lang when no language attribute is present.
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
        """Download an XSPF playlist and parse it into playlist entries."""
        xspf = self._download_xml(
            # NOTE(review): 'xpsf' in the note below is a typo for 'xspf'
            # (user-visible progress message; left unchanged here).
            playlist_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        """Parse an XSPF playlist document into a list of entry info dicts.

        Also understands the StreamOne 's1' extension attributes on
        <location> elements (label/width/height).
        """
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',

        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # XSPF durations are in milliseconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        """Download a DASH MPD manifest and parse it into a list of format dicts.

        NOTE(review): `formats_dict={}` is a mutable default; it is only
        forwarded and copied from downstream, so it appears harmless — a None
        default would still be safer. Confirm against _parse_mpd_formats.
        """
        res = self._download_webpage_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base relative URLs on the final (post-redirect) manifest URL.
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # Live ("dynamic") manifests are not handled here.
        if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            # Any ContentProtection child marks the element as DRM-protected.
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge this element's segment info over its parent's (see [1, 5.3.9.2.2]).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is a repeat count; 0 when absent.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM-protected sets/representations are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type == 'video' or content_type == 'audio':
                        # Accumulate BaseURL from the most specific element outwards.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube extension: exact content length on BaseURL.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # 'mul'/'und'/'zxx'/'mis' are "no useful language" codes.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert $Identifier$ / $Identifier%fmt$ MPD template
                            # syntax into Python %-format placeholders.
                            t = representation_ms_info[template_name]
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string; the
                            # result below is discarded, so '$$' is never unescaped.
                            # Should likely read `t = t.replace('$$', '$')`.
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                # NOTE(review): `and 'segment_duration'` is a constant
                                # truthy string — almost certainly meant
                                # `'segment_duration' in representation_ms_info`.
                                if 'total_number' not in representation_ms_info and 'segment_duration':
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    'url': media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]

                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # Expand the @r repeat count into explicit segments.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                        'url': representation_ms_info['segment_urls'][segment_index],
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({'url': initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            for fragment in f['fragments']:
                                fragment['url'] = urljoin(base_url, fragment['url'])
                        # Merge with an existing format of the same id, if any.
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            formats.append(full_info)
                            existing_format.update(f)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        """Download a Smooth Streaming (ISM) manifest and parse it into format dicts."""
        res = self._download_webpage_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        return self._parse_ism_formats(
            compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1924 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1925 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1928 duration = int(ism_doc.attrib['Duration'])
1929 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1932 for stream in ism_doc.findall('StreamIndex'):
1933 stream_type = stream.get('Type')
1934 if stream_type not in ('video', 'audio'):
1936 url_pattern = stream.attrib['Url']
1937 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1938 stream_name = stream.get('Name')
1939 for track in stream.findall('QualityLevel'):
1940 fourcc = track.get('FourCC')
1941 # TODO: add support for WVC1 and WMAP
1942 if fourcc not in ('H264', 'AVC1', 'AACL'):
1943 self.report_warning('%s is not a supported codec' % fourcc)
1945 tbr = int(track.attrib['Bitrate']) // 1000
1946 width = int_or_none(track.get('MaxWidth'))
1947 height = int_or_none(track.get('MaxHeight'))
1948 sampling_rate = int_or_none(track.get('SamplingRate'))
1950 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1951 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1957 stream_fragments = stream.findall('c')
1958 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1959 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1960 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1961 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1962 if not fragment_ctx['duration']:
1964 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1966 next_fragment_time = duration
1967 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1968 for _ in range(fragment_repeat):
1970 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1971 'duration': fragment_ctx['duration'] / stream_timescale,
1973 fragment_ctx['time'] += fragment_ctx['duration']
1977 format_id.append(ism_id)
1979 format_id.append(stream_name)
1980 format_id.append(compat_str(tbr))
1983 'format_id': '-'.join(format_id),
1985 'manifest_url': ism_url,
1986 'ext': 'ismv' if stream_type == 'video' else 'isma',
1990 'asr': sampling_rate,
1991 'vcodec': 'none' if stream_type == 'audio' else fourcc,
1992 'acodec': 'none' if stream_type == 'video' else fourcc,
1994 'fragments': fragments,
1995 '_download_params': {
1996 'duration': duration,
1997 'timescale': stream_timescale,
1998 'width': width or 0,
1999 'height': height or 0,
2001 'codec_private_data': track.get('CodecPrivateData'),
2002 'sampling_rate': sampling_rate,
2003 'channels': int_or_none(track.get('Channels', 2)),
2004 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2005 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Scan *webpage* for HTML5 <video>/<audio> tags and build one entry
        (formats + subtitles + thumbnail) per media tag."""
        # Resolve a possibly relative media URL against the page URL.
        def absolute_url(video_url):
            return compat_urlparse.urljoin(base_url, video_url)

        # Turn a MIME "type" attribute (possibly carrying codecs=...) into a
        # partial format dict (ext/vcodec/acodec).
        def parse_content_type(content_type):
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        # Expand a src attribute into (is_plain_url, formats); manifest URLs
        # (m3u8/mpd) are expanded into their variant formats.
        def _media_formats(src, cur_media_type):
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
                    # A plain URL inside <audio> carries no video stream.
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # Self-closing <video .../> / <audio .../> tags have no inner content.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
                # <source> children extend the tag-level src.
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    is_plain_url, formats = _media_formats(src, media_type)
                        # Merge MIME-derived info into the single plain format.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                # <track> children: subtitles/captions keyed by language.
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            # Keep only tags that actually produced media.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
2094 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2096 hdcore_sign = 'hdcore=3.7.0'
2097 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2098 hds_host = hosts.get('hds')
2100 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2101 if 'hdcore=' not in f4m_url:
2102 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2103 f4m_formats = self._extract_f4m_formats(
2104 f4m_url, video_id, f4m_id='hds', fatal=False)
2105 for entry in f4m_formats:
2106 entry.update({'extra_param_to_segment_url': hdcore_sign})
2107 formats.extend(f4m_formats)
2108 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2109 hls_host = hosts.get('hls')
2111 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2112 formats.extend(self._extract_m3u8_formats(
2113 m3u8_url, video_id, 'mp4', 'm3u8_native',
2114 m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe the standard Wowza manifest endpoints (HLS/HDS/DASH and,
        for SMIL streams, RTMP/RTSP) below a single stream base URL.
        NB: the mutable skip_protocols default is only read, never mutated."""
        # Strip a trailing manifest filename to obtain the stream base URL.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                http_base_url + '/manifest.mpd',
                video_id, mpd_id='dash', fatal=False))
        # SMIL-based streams additionally expose RTMP, from which an RTSP
        # variant can be derived by swapping scheme and re-joining play_path.
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            # Non-SMIL: offer the bare rtmp/rtsp URLs directly.
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': protocol + url_base,
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(webpage):
        # Locate a jwplayer("...").setup({...}) call in the page and return
        # the raw JS source of the options argument (or None if absent).
            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
            return mobj.group('options')
2168 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2169 jwplayer_data = self._parse_json(
2170 self._find_jwplayer_data(webpage), video_id,
2171 transform_source=js_to_json)
2172 return self._parse_jwplayer_data(
2173 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed JWPlayer setup/config dict into info-dict entries
        (one per playlist item), normalising the many legacy layouts."""
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            for source in video_data['sources']:
                # 'file' may be protocol-relative (//host/...).
                source_url = self._proto_relative_url(source['file'])
                    source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    height = int_or_none(source.get('height'))
                        # Often no height is provided but there is a label in
                        # format like 1080p.
                        height = int_or_none(self._search_regex(
                            r'^(\d{3,})[pP]$', source.get('label') or '',
                            'height', default=None))
                        'width': int_or_none(source.get('width')),
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'
                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            rtmp_url, prefix, play_path = rtmp_url_parts
                                'play_path': prefix + play_path,
                        a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks become subtitles.
                    if track.get('kind') != 'captions':
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,

        # A single entry is returned bare, several as a playlist result.
        if len(entries) == 1:
            return self.playlist_result(entries)
2277 def _live_title(self, name):
2278 """ Generate the title for a live video """
2279 now = datetime.datetime.now()
2280 now_str = now.strftime('%Y-%m-%d %H:%M')
2281 return name + ' ' + now_str
2283 def _int(self, v, name, fatal=False, **kwargs):
2284 res = int_or_none(v, **kwargs)
2285 if 'get_attr' in kwargs:
2286 print(getattr(v, kwargs['get_attr']))
2288 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2290 raise ExtractorError(msg)
2292 self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Float counterpart of _int: delegate to float_or_none, then either
        # raise (fatal=True) or warn when the value could not be parsed.
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
2305 def _set_cookie(self, domain, name, value, expire_time=None):
2306 cookie = compat_cookiejar.Cookie(
2307 0, name, value, None, None, domain, None,
2308 None, '/', True, False, expire_time, '', None, None, None)
2309 self._downloader.cookiejar.set_cookie(cookie)
2311 def _get_cookies(self, url):
2312 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2313 req = sanitized_Request(url)
2314 self._downloader.cookiejar.add_cookie_header(req)
2315 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases, taken from either _TEST (single
        # case) or _TESTS (list); defining both is a programming error.
        t = getattr(self, '_TEST', None)
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # Skip URL-matching-only cases unless explicitly requested.
            if not include_onlymatching and t.get('only_matching', False):
            # Tag each test with the extractor name (class name minus 'IE').
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist test cases, judge by the first entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        # Suitable unless at least one test case is age-restricted.
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        # Only fetch subtitles when the user asked to write or list them;
        # the actual work is done by the subclass hook _get_subtitles.
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
    def _get_subtitles(self, *args, **kwargs):
        """Subclass hook doing the actual subtitle extraction; see
        extract_subtitles for the public entry point."""
        raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # Items whose URL already occurs in the first list win.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            # Per-language merge; duplicate URLs are dropped.
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    def extract_automatic_captions(self, *args, **kwargs):
        # Only fetch automatic captions when the user asked to write or list
        # subtitles; delegates to the subclass hook _get_automatic_captions.
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
    def _get_automatic_captions(self, *args, **kwargs):
        """Subclass hook doing the actual automatic-caption extraction; see
        extract_automatic_captions for the public entry point."""
        raise NotImplementedError('This method must be implemented by subclasses')
2381 def mark_watched(self, *args, **kwargs):
2382 if (self._downloader.params.get('mark_watched', False) and
2383 (self._get_login_info()[0] is not None or
2384 self._downloader.params.get('cookiefile') is not None)):
2385 self._mark_watched(*args, **kwargs)
    def _mark_watched(self, *args, **kwargs):
        """Subclass hook doing the actual mark-as-watched request; see
        mark_watched for the public entry point."""
        raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        # Build extra HTTP headers for geo-restriction checks; honours the
        # --geo-verification-proxy option when it is set.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2397 def _generic_id(self, url):
2398 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2400 def _generic_title(self, url):
2401 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2404 class SearchInfoExtractor(InfoExtractor):
2406 Base class for paged search queries extractors.
2407 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2408 Instances should define _SEARCH_KEY and _MAX_RESULTS.
    def _make_valid_url(cls):
        # Search "URLs" look like '<key>:', '<key><count>:' or '<key>all:',
        # each followed by the free-form query text.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # True when *url* matches this extractor's search scheme.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Split the search "URL" into prefix (result count / 'all') and the
        # actual query, then delegate to _get_n_results.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            # Numeric prefix: that many results, capped at _MAX_RESULTS.
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Subclass hook; must return a playlist result with up to n entries.
        raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Public read-only accessor for the class-level _SEARCH_KEY.
        return self._SEARCH_KEY