[extractor/common] Improve m3u8 extraction (closes #12211)
[youtube-dl] / youtube_dl / extractor / common.py
index 6d4789d9636cc6874e2979dcfcbe56839323ad7c..359c549c5c13d8c8f26fd82041166e52d0897ae2 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import base64
 from __future__ import unicode_literals
 
 import base64
@@ -36,34 +37,35 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     determine_ext,
     clean_html,
     compiled_regex_type,
     determine_ext,
+    determine_protocol,
     error_to_compat_str,
     ExtractorError,
     error_to_compat_str,
     ExtractorError,
+    extract_attributes,
     fix_xml_ampersands,
     float_or_none,
     GeoRestrictedError,
     GeoUtils,
     int_or_none,
     js_to_json,
     fix_xml_ampersands,
     float_or_none,
     GeoRestrictedError,
     GeoUtils,
     int_or_none,
     js_to_json,
+    mimetype2ext,
+    orderedSet,
+    parse_codecs,
+    parse_duration,
     parse_iso8601,
     parse_iso8601,
+    parse_m3u8_attributes,
     RegexNotFoundError,
     RegexNotFoundError,
-    sanitize_filename,
     sanitized_Request,
     sanitized_Request,
+    sanitize_filename,
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
+    update_Request,
+    update_url_query,
+    urljoin,
     url_basename,
     xpath_element,
     xpath_text,
     xpath_with_ns,
     url_basename,
     xpath_element,
     xpath_text,
     xpath_with_ns,
-    determine_protocol,
-    parse_duration,
-    mimetype2ext,
-    update_Request,
-    update_url_query,
-    parse_m3u8_attributes,
-    extract_attributes,
-    parse_codecs,
-    urljoin,
 )
 
 
 )
 
 
@@ -393,7 +395,6 @@ class InfoExtractor(object):
         is selected and a random IP belonging to this country is generated. This
         IP will be passed as X-Forwarded-For HTTP header in all subsequent
         HTTP requests.
         is selected and a random IP belonging to this country is generated. This
         IP will be passed as X-Forwarded-For HTTP header in all subsequent
         HTTP requests.
-        Method does nothing if no countries are specified.
 
         This method will be used for initial geo bypass mechanism initialization
         during the instance initialization with _GEO_COUNTRIES.
 
         This method will be used for initial geo bypass mechanism initialization
         during the instance initialization with _GEO_COUNTRIES.
@@ -402,8 +403,6 @@ class InfoExtractor(object):
         information is not available beforehand (e.g. obtained during
         extraction) or due to some another reason.
         """
         information is not available beforehand (e.g. obtained during
         extraction) or due to some another reason.
         """
-        if not countries:
-            return
         if not self._x_forwarded_for_ip:
             country_code = self._downloader.params.get('geo_bypass_country', None)
             # If there is no explicit country for geo bypass specified and
         if not self._x_forwarded_for_ip:
             country_code = self._downloader.params.get('geo_bypass_country', None)
             # If there is no explicit country for geo bypass specified and
@@ -418,7 +417,8 @@ class InfoExtractor(object):
                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_stdout(
                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_stdout(
-                        '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
+                        % (self._x_forwarded_for_ip, country_code.upper()))
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
@@ -447,10 +447,12 @@ class InfoExtractor(object):
                 self._downloader.params.get('geo_bypass', True) and
                 not self._x_forwarded_for_ip and
                 countries):
                 self._downloader.params.get('geo_bypass', True) and
                 not self._x_forwarded_for_ip and
                 countries):
-            self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
+            country_code = random.choice(countries)
+            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
             if self._x_forwarded_for_ip:
                 self.report_warning(
             if self._x_forwarded_for_ip:
                 self.report_warning(
-                    'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+                    % (self._x_forwarded_for_ip, country_code.upper()))
                 return True
         return False
 
                 return True
         return False
 
@@ -546,6 +548,34 @@ class InfoExtractor(object):
 
         return encoding
 
 
         return encoding
 
+    def __check_blocked(self, content):
+        first_block = content[:512]
+        if ('<title>Access to this site is blocked</title>' in content and
+                'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += ' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+        if '<title>The URL you requested has been blocked</title>' in first_block:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+                'blocklist.rkn.gov.ru' in content):
+            raise ExtractorError(
+                'Access to this webpage has been blocked by decision of the Russian government. '
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
@@ -587,25 +617,7 @@ class InfoExtractor(object):
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')
 
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')
 
-        if ('<title>Access to this site is blocked</title>' in content and
-                'Websense' in content[:512]):
-            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
-            blocked_iframe = self._html_search_regex(
-                r'<iframe src="([^"]+)"', content,
-                'Websense information URL', default=None)
-            if blocked_iframe:
-                msg += ' Visit %s for more details' % blocked_iframe
-            raise ExtractorError(msg, expected=True)
-        if '<title>The URL you requested has been blocked</title>' in content[:512]:
-            msg = (
-                'Access to this webpage has been blocked by Indian censorship. '
-                'Use a VPN or proxy server (with --proxy) to route around it.')
-            block_msg = self._html_search_regex(
-                r'</h1><p>(.*?)</p>',
-                content, 'block message', default=None)
-            if block_msg:
-                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
-            raise ExtractorError(msg, expected=True)
+        self.__check_blocked(content)
 
         return content
 
 
         return content
 
@@ -714,6 +726,13 @@ class InfoExtractor(object):
             video_info['title'] = video_title
         return video_info
 
             video_info['title'] = video_title
         return video_info
 
+    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
+        urlrs = orderedSet(
+            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+            for m in matches)
+        return self.playlist_result(
+            urlrs, playlist_id=video_id, playlist_title=video_title)
+
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
         """Returns a playlist"""
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
         """Returns a playlist"""
@@ -957,6 +976,22 @@ class InfoExtractor(object):
             return info
         if isinstance(json_ld, dict):
             json_ld = [json_ld]
             return info
         if isinstance(json_ld, dict):
             json_ld = [json_ld]
+
+        def extract_video_object(e):
+            assert e['@type'] == 'VideoObject'
+            info.update({
+                'url': e.get('contentUrl'),
+                'title': unescapeHTML(e.get('name')),
+                'description': unescapeHTML(e.get('description')),
+                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+                'duration': parse_duration(e.get('duration')),
+                'timestamp': unified_timestamp(e.get('uploadDate')),
+                'filesize': float_or_none(e.get('contentSize')),
+                'tbr': int_or_none(e.get('bitrate')),
+                'width': int_or_none(e.get('width')),
+                'height': int_or_none(e.get('height')),
+            })
+
         for e in json_ld:
             if e.get('@context') == 'http://schema.org':
                 item_type = e.get('@type')
         for e in json_ld:
             if e.get('@context') == 'http://schema.org':
                 item_type = e.get('@type')
@@ -981,18 +1016,11 @@ class InfoExtractor(object):
                         'description': unescapeHTML(e.get('articleBody')),
                     })
                 elif item_type == 'VideoObject':
                         'description': unescapeHTML(e.get('articleBody')),
                     })
                 elif item_type == 'VideoObject':
-                    info.update({
-                        'url': e.get('contentUrl'),
-                        'title': unescapeHTML(e.get('name')),
-                        'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
-                        'duration': parse_duration(e.get('duration')),
-                        'timestamp': unified_timestamp(e.get('uploadDate')),
-                        'filesize': float_or_none(e.get('contentSize')),
-                        'tbr': int_or_none(e.get('bitrate')),
-                        'width': int_or_none(e.get('width')),
-                        'height': int_or_none(e.get('height')),
-                    })
+                    extract_video_object(e)
+                elif item_type == 'WebPage':
+                    video = e.get('video')
+                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                        extract_video_object(video)
                 break
         return dict((k, v) for k, v in info.items() if v is not None)
 
                 break
         return dict((k, v) for k, v in info.items() if v is not None)
 
@@ -1284,17 +1312,25 @@ class InfoExtractor(object):
                               entry_protocol='m3u8', preference=None,
                               m3u8_id=None, note=None, errnote=None,
                               fatal=True, live=False):
                               entry_protocol='m3u8', preference=None,
                               m3u8_id=None, note=None, errnote=None,
                               fatal=True, live=False):
-
         res = self._download_webpage_handle(
             m3u8_url, video_id,
             note=note or 'Downloading m3u8 information',
             errnote=errnote or 'Failed to download m3u8 information',
             fatal=fatal)
         res = self._download_webpage_handle(
             m3u8_url, video_id,
             note=note or 'Downloading m3u8 information',
             errnote=errnote or 'Failed to download m3u8 information',
             fatal=fatal)
+
         if res is False:
             return []
         if res is False:
             return []
+
         m3u8_doc, urlh = res
         m3u8_url = urlh.geturl()
 
         m3u8_doc, urlh = res
         m3u8_url = urlh.geturl()
 
+        return self._parse_m3u8_formats(
+            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+            preference=preference, m3u8_id=m3u8_id, live=live)
+
+    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+                            entry_protocol='m3u8', preference=None,
+                            m3u8_id=None, live=False):
         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
             return []
 
         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
             return []
 
@@ -1305,19 +1341,21 @@ class InfoExtractor(object):
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))
 
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))
 
-        # We should try extracting formats only from master playlists [1], i.e.
-        # playlists that describe available qualities. On the other hand media
-        # playlists [2] should be returned as is since they contain just the media
-        # without qualities renditions.
+        # References:
+        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+        # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+        # We should try extracting formats only from master playlists [1, 4.3.4],
+        # i.e. playlists that describe available qualities. On the other hand
+        # media playlists [1, 4.3.3] should be returned as is since they contain
+        # just the media without qualities renditions.
         # Fortunately, master playlist can be easily distinguished from media
         # Fortunately, master playlist can be easily distinguished from media
-        # playlist based on particular tags availability. As of [1, 2] master
-        # playlist tags MUST NOT appear in a media playist and vice versa.
-        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
-        # and MUST NOT appear in master playlist thus we can clearly detect media
-        # playlist with this criterion.
-        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
-        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
-        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+        # master playlist tags MUST NOT appear in a media playist and vice versa.
+        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+        # media playlist and MUST NOT appear in master playlist thus we can
+        # clearly detect media playlist with this criterion.
+
         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
             return [{
                 'url': m3u8_url,
         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
             return [{
                 'url': m3u8_url,
@@ -1326,52 +1364,67 @@ class InfoExtractor(object):
                 'protocol': entry_protocol,
                 'preference': preference,
             }]
                 'protocol': entry_protocol,
                 'preference': preference,
             }]
-        audio_in_video_stream = {}
-        last_info = {}
-        last_media = {}
+
+        groups = {}
+        last_stream_inf = {}
+
+        def extract_media(x_media_line):
+            media = parse_m3u8_attributes(x_media_line)
+            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+            if not (media_type and group_id and name):
+                return
+            groups.setdefault(group_id, []).append(media)
+            if media_type not in ('VIDEO', 'AUDIO'):
+                return
+            media_url = media.get('URI')
+            if media_url:
+                format_id = []
+                for v in (group_id, name):
+                    if v:
+                        format_id.append(v)
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'url': format_url(media_url),
+                    'language': media.get('LANGUAGE'),
+                    'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                }
+                if media_type == 'AUDIO':
+                    f['vcodec'] = 'none'
+                formats.append(f)
+
+        def build_stream_name():
+            # Despite specification does not mention NAME attribute for
+            # EXT-X-STREAM-INF it still sometimes may be present
+            stream_name = last_stream_inf.get('NAME')
+            if stream_name:
+                return stream_name
+            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+            # from corresponding rendition group
+            stream_group_id = last_stream_inf.get('VIDEO')
+            if not stream_group_id:
+                return
+            stream_group = groups.get(stream_group_id)
+            if not stream_group:
+                return stream_group_id
+            rendition = stream_group[0]
+            return rendition.get('NAME') or stream_group_id
+
         for line in m3u8_doc.splitlines():
             if line.startswith('#EXT-X-STREAM-INF:'):
         for line in m3u8_doc.splitlines():
             if line.startswith('#EXT-X-STREAM-INF:'):
-                last_info = parse_m3u8_attributes(line)
+                last_stream_inf = parse_m3u8_attributes(line)
             elif line.startswith('#EXT-X-MEDIA:'):
             elif line.startswith('#EXT-X-MEDIA:'):
-                media = parse_m3u8_attributes(line)
-                media_type = media.get('TYPE')
-                if media_type in ('VIDEO', 'AUDIO'):
-                    group_id = media.get('GROUP-ID')
-                    media_url = media.get('URI')
-                    if media_url:
-                        format_id = []
-                        for v in (group_id, media.get('NAME')):
-                            if v:
-                                format_id.append(v)
-                        f = {
-                            'format_id': '-'.join(format_id),
-                            'url': format_url(media_url),
-                            'language': media.get('LANGUAGE'),
-                            'ext': ext,
-                            'protocol': entry_protocol,
-                            'preference': preference,
-                        }
-                        if media_type == 'AUDIO':
-                            f['vcodec'] = 'none'
-                            if group_id and not audio_in_video_stream.get(group_id):
-                                audio_in_video_stream[group_id] = False
-                        formats.append(f)
-                    else:
-                        # When there is no URI in EXT-X-MEDIA let this tag's
-                        # data be used by regular URI lines below
-                        last_media = media
-                        if media_type == 'AUDIO' and group_id:
-                            audio_in_video_stream[group_id] = True
+                extract_media(line)
             elif line.startswith('#') or not line.strip():
                 continue
             else:
             elif line.startswith('#') or not line.strip():
                 continue
             else:
-                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+                tbr = int_or_none(last_stream_inf.get('AVERAGE-BANDWIDTH') or last_stream_inf.get('BANDWIDTH'), scale=1000)
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
-                # Despite specification does not mention NAME attribute for
-                # EXT-X-STREAM-INF it still sometimes may be present
-                stream_name = last_info.get('NAME') or last_media.get('NAME')
+                stream_name = build_stream_name()
                 # Bandwidth of live streams may differ over time thus making
                 # format_id unpredictable. So it's better to keep provided
                 # format_id intact.
                 # Bandwidth of live streams may differ over time thus making
                 # format_id unpredictable. So it's better to keep provided
                 # format_id intact.
@@ -1384,11 +1437,11 @@ class InfoExtractor(object):
                     'manifest_url': manifest_url,
                     'tbr': tbr,
                     'ext': ext,
                     'manifest_url': manifest_url,
                     'tbr': tbr,
                     'ext': ext,
-                    'fps': float_or_none(last_info.get('FRAME-RATE')),
+                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
-                resolution = last_info.get('RESOLUTION')
+                resolution = last_stream_inf.get('RESOLUTION')
                 if resolution:
                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                     if mobj:
                 if resolution:
                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                     if mobj:
@@ -1404,13 +1457,26 @@ class InfoExtractor(object):
                         'vbr': vbr,
                         'abr': abr,
                     })
                         'vbr': vbr,
                         'abr': abr,
                     })
-                f.update(parse_codecs(last_info.get('CODECS')))
-                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
-                    # TODO: update acodec for audio only formats with the same GROUP-ID
-                    f['acodec'] = 'none'
+                codecs = parse_codecs(last_stream_inf.get('CODECS'))
+                f.update(codecs)
+                audio_group_id = last_stream_inf.get('AUDIO')
+                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+                # references a rendition group MUST have a CODECS attribute.
+                # However, this is not always respected, for example, [2]
+                # contains EXT-X-STREAM-INF tag which references AUDIO
+                # rendition group but does not have CODECS and despite
+                # referencing audio group an audio group, it represents
+                # a complete (with audio and video) format. So, for such cases
+                # we will ignore references to rendition groups and treat them
+                # as complete formats.
+                if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    audio_group = groups.get(audio_group_id)
+                    if audio_group and audio_group[0].get('URI'):
+                        # TODO: update acodec for audio only formats with
+                        # the same GROUP-ID
+                        f['acodec'] = 'none'
                 formats.append(f)
                 formats.append(f)
-                last_info = {}
-                last_media = {}
+                last_stream_inf = {}
         return formats
 
     @staticmethod
         return formats
 
     @staticmethod
@@ -1760,7 +1826,7 @@ class InfoExtractor(object):
                     if content_type == 'text':
                         # TODO implement WebVTT downloading
                         pass
                     if content_type == 'text':
                         # TODO implement WebVTT downloading
                         pass
-                    elif content_type == 'video' or content_type == 'audio':
+                    elif content_type in ('video', 'audio'):
                         base_url = ''
                         for element in (representation, adaptation_set, period, mpd_doc):
                             base_url_e = element.find(_add_ns('BaseURL'))
                         base_url = ''
                         for element in (representation, adaptation_set, period, mpd_doc):
                             base_url_e = element.find(_add_ns('BaseURL'))
@@ -2010,7 +2076,7 @@ class InfoExtractor(object):
                 })
         return formats
 
                 })
         return formats
 
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
         def absolute_url(video_url):
             return compat_urlparse.urljoin(base_url, video_url)
 
         def absolute_url(video_url):
             return compat_urlparse.urljoin(base_url, video_url)
 
@@ -2032,7 +2098,8 @@ class InfoExtractor(object):
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
-                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+                    preference=preference)
             elif ext == 'mpd':
                 is_plain_url = False
                 formats = self._extract_mpd_formats(
             elif ext == 'mpd':
                 is_plain_url = False
                 formats = self._extract_mpd_formats(
@@ -2160,18 +2227,24 @@ class InfoExtractor(object):
                     })
         return formats
 
                     })
         return formats
 
-    @staticmethod
-    def _find_jwplayer_data(webpage):
+    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
         mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\).*?\.setup\s*\((?P<options>[^)]+)\)',
             webpage)
         if mobj:
             webpage)
         if mobj:
-            return mobj.group('options')
+            try:
+                jwplayer_data = self._parse_json(mobj.group('options'),
+                                                 video_id=video_id,
+                                                 transform_source=transform_source)
+            except ExtractorError:
+                pass
+            else:
+                if isinstance(jwplayer_data, dict):
+                    return jwplayer_data
 
     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
 
     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-        jwplayer_data = self._parse_json(
-            self._find_jwplayer_data(webpage), video_id,
-            transform_source=js_to_json)
+        jwplayer_data = self._find_jwplayer_data(
+            webpage, video_id, transform_source=js_to_json)
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
 
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
 
@@ -2197,56 +2270,9 @@ class InfoExtractor(object):
 
             this_video_id = video_id or video_data['mediaid']
 
 
             this_video_id = video_id or video_data['mediaid']
 
-            formats = []
-            for source in video_data['sources']:
-                source_url = self._proto_relative_url(source['file'])
-                if base_url:
-                    source_url = compat_urlparse.urljoin(base_url, source_url)
-                source_type = source.get('type') or ''
-                ext = mimetype2ext(source_type) or determine_ext(source_url)
-                if source_type == 'hls' or ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
-                elif ext == 'mpd':
-                    formats.extend(self._extract_mpd_formats(
-                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
-                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
-                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
-                    formats.append({
-                        'url': source_url,
-                        'vcodec': 'none',
-                        'ext': ext,
-                    })
-                else:
-                    height = int_or_none(source.get('height'))
-                    if height is None:
-                        # Often no height is provided but there is a label in
-                        # format like 1080p.
-                        height = int_or_none(self._search_regex(
-                            r'^(\d{3,})[pP]$', source.get('label') or '',
-                            'height', default=None))
-                    a_format = {
-                        'url': source_url,
-                        'width': int_or_none(source.get('width')),
-                        'height': height,
-                        'ext': ext,
-                    }
-                    if source_url.startswith('rtmp'):
-                        a_format['ext'] = 'flv'
-
-                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
-                        # of jwplayer.flash.swf
-                        rtmp_url_parts = re.split(
-                            r'((?:mp4|mp3|flv):)', source_url, 1)
-                        if len(rtmp_url_parts) == 3:
-                            rtmp_url, prefix, play_path = rtmp_url_parts
-                            a_format.update({
-                                'url': rtmp_url,
-                                'play_path': prefix + play_path,
-                            })
-                        if rtmp_params:
-                            a_format.update(rtmp_params)
-                    formats.append(a_format)
+            formats = self._parse_jwplayer_formats(
+                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
             self._sort_formats(formats)
 
             subtitles = {}
             self._sort_formats(formats)
 
             subtitles = {}
@@ -2277,6 +2303,71 @@ class InfoExtractor(object):
         else:
             return self.playlist_result(entries)
 
         else:
             return self.playlist_result(entries)
 
+    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
+                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
+        formats = []
+        for source in jwplayer_sources_data:
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
+            if base_url:
+                source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
+            source_type = source.get('type') or ''
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if source_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+            elif source_type.startswith('audio') or ext in (
+                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+                formats.append({
+                    'url': source_url,
+                    'vcodec': 'none',
+                    'ext': ext,
+                })
+            else:
+                height = int_or_none(source.get('height'))
+                if height is None:
+                    # Often no height is provided but there is a label in
+                    # format like "1080p", "720p SD", or 1080.
+                    height = int_or_none(self._search_regex(
+                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
+                        'height', default=None))
+                a_format = {
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'height': height,
+                    'tbr': int_or_none(source.get('bitrate')),
+                    'ext': ext,
+                }
+                if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                    # of jwplayer.flash.swf
+                    rtmp_url_parts = re.split(
+                        r'((?:mp4|mp3|flv):)', source_url, 1)
+                    if len(rtmp_url_parts) == 3:
+                        rtmp_url, prefix, play_path = rtmp_url_parts
+                        a_format.update({
+                            'url': rtmp_url,
+                            'play_path': prefix + play_path,
+                        })
+                    if rtmp_params:
+                        a_format.update(rtmp_params)
+                formats.append(a_format)
+        return formats
+
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()