Merge pull request #14225 from Tithen-Firion/openload-phantomjs-method
author Yen Chi Hsuan <yan12125@gmail.com>
Fri, 15 Sep 2017 18:28:28 +0000 (02:28 +0800)
committer GitHub <noreply@github.com>
Fri, 15 Sep 2017 18:28:28 +0000 (02:28 +0800)
Openload phantomjs method

youtube_dl/YoutubeDL.py
youtube_dl/extractor/common.py
youtube_dl/utils.py

diff --combined youtube_dl/YoutubeDL.py
index 5405a87c5f377d7673b08e5560546d639614a6ca,033b50702c7cdb8056f11e2f60f96363416b4c2e..bfb4ff225f2376c09b2e595276244d887a5e879c
@@@ -26,8 -26,6 +26,8 @@@ import tokenize
  import traceback
  import random
  
 +from string import ascii_letters
 +
  from .compat import (
      compat_basestring,
      compat_cookiejar,
@@@ -60,7 -58,6 +60,7 @@@ from .utils import 
      format_bytes,
      formatSeconds,
      GeoRestrictedError,
 +    int_or_none,
      ISO3166Utils,
      locked_file,
      make_HTTPS_handler,
@@@ -89,6 -86,7 +89,7 @@@
      write_string,
      YoutubeDLCookieProcessor,
      YoutubeDLHandler,
+     PhantomJSwrapper,
  )
  from .cache import Cache
  from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
@@@ -305,17 -303,6 +306,17 @@@ class YoutubeDL(object)
                          postprocessor.
      """
  
 +    _NUMERIC_FIELDS = set((
 +        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
 +        'timestamp', 'upload_year', 'upload_month', 'upload_day',
 +        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
 +        'average_rating', 'comment_count', 'age_limit',
 +        'start_time', 'end_time',
 +        'chapter_number', 'season_number', 'episode_number',
 +        'track_number', 'disc_number', 'release_year',
 +        'playlist_index',
 +    ))
 +
      params = None
      _ies = []
      _pps = []
                  else:
                      raise
  
 -        if (sys.version_info >= (3,) and sys.platform != 'win32' and
 +        if (sys.platform != 'win32' and
                  sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                  not params.get('restrictfilenames', False)):
 -            # On Python 3, the Unicode filesystem API will throw errors (#1474)
 +            # Unicode filesystem API will throw errors (#1474, #13027)
              self.report_warning(
                  'Assuming --restrict-filenames since file system encoding '
                  'cannot encode all characters. '
      def to_console_title(self, message):
          if not self.params.get('consoletitle', False):
              return
 -        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 -            # c_wchar_p() might not be necessary if `message` is
 -            # already of type unicode()
 -            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 +        if compat_os_name == 'nt':
 +            if ctypes.windll.kernel32.GetConsoleWindow():
 +                # c_wchar_p() might not be necessary if `message` is
 +                # already of type unicode()
 +                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
          elif 'TERM' in os.environ:
              self._write_string('\033]0;%s\007' % message, self._screen_file)
  
      def save_console_title(self):
          if not self.params.get('consoletitle', False):
              return
 -        if 'TERM' in os.environ:
 +        if compat_os_name != 'nt' and 'TERM' in os.environ:
              # Save the title on stack
              self._write_string('\033[22;0t', self._screen_file)
  
      def restore_console_title(self):
          if not self.params.get('consoletitle', False):
              return
 -        if 'TERM' in os.environ:
 +        if compat_os_name != 'nt' and 'TERM' in os.environ:
              # Restore the title from stack
              self._write_string('\033[23;0t', self._screen_file)
  
                      r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                      outtmpl)
  
 -            NUMERIC_FIELDS = set((
 -                'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
 -                'timestamp', 'upload_year', 'upload_month', 'upload_day',
 -                'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
 -                'average_rating', 'comment_count', 'age_limit',
 -                'start_time', 'end_time',
 -                'chapter_number', 'season_number', 'episode_number',
 -                'track_number', 'disc_number', 'release_year',
 -                'playlist_index',
 -            ))
 -
              # Missing numeric fields used together with integer presentation types
              # in format specification will break the argument substitution since
              # string 'NA' is returned for missing fields. We will patch output
              # template for missing fields to meet string presentation type.
 -            for numeric_field in NUMERIC_FIELDS:
 +            for numeric_field in self._NUMERIC_FIELDS:
                  if numeric_field not in template_dict:
                      # As of [1] format syntax is:
                      #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
                          FORMAT_RE.format(numeric_field),
                          r'%({0})s'.format(numeric_field), outtmpl)
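
A rough standalone illustration of the patch above, with a simplified stand-in for FORMAT_RE (the real pattern in YoutubeDL.py also handles conversion flags and length modifiers):

    import re

    # 'playlist_index' is missing, so its integer conversion is relaxed to a
    # plain string conversion and the 'NA' placeholder substitutes cleanly.
    outtmpl = '%(playlist_index)05d-%(title)s.%(ext)s'
    outtmpl = re.sub(r'%\(playlist_index\)[^diouxXeEfFgGcrs]*[diouxXeEfFgGcrs]',
                     r'%(playlist_index)s', outtmpl)
    print(outtmpl % {'playlist_index': 'NA', 'title': 'clip', 'ext': 'mp4'})
    # NA-clip.mp4
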
  
 -            filename = expand_path(outtmpl % template_dict)
 +            # expand_path translates '%%' into '%' and '$$' into '$'
 +            # respectively; that is not what we want, since we need to keep
 +            # '%%' intact for the template dict substitution step. Work
 +            # around this with a boundary-like separator hack.
 +            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
 +            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
 +
 +            # outtmpl should be expand_path'ed before template dict substitution
 +            # because meta fields may contain env variables we don't want to
 +            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
 +            # title "Hello $PATH", we don't want `$PATH` to be expanded.
 +            filename = expand_path(outtmpl).replace(sep, '') % template_dict
 +
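
As a standalone sketch of the separator hack just added (the helper name and the expand_path parameter are ours, for illustration only):

    import random
    from string import ascii_letters

    def expand_outtmpl(outtmpl, template_dict, expand_path):
        # Shield literal '%%' and '$$' from expand_path with a throwaway
        # separator, expand env vars, then strip the separator again so the
        # final '%' substitution still sees the intact '%%'.
        sep = ''.join(random.choice(ascii_letters) for _ in range(32))
        shielded = (outtmpl.replace('%%', '%{0}%'.format(sep))
                           .replace('$$', '${0}$'.format(sep)))
        return expand_path(shielded).replace(sep, '') % template_dict
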
              # Temporary fix for #4787
              # 'Treat' all problem characters by passing filename through preferredencoding
              # to workaround encoding issues with subprocess on python2 @ Windows
  
              force_properties = dict(
                  (k, v) for k, v in ie_result.items() if v is not None)
 -            for f in ('_type', 'url', 'ie_key'):
 +            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                  if f in force_properties:
                      del force_properties[f]
              new_result = info.copy()
              return op(actual_value, comparison_value)
          return _filter
  
 +    def _default_format_spec(self, info_dict, download=True):
 +        req_format_list = []
 +
 +        def can_have_partial_formats():
 +            if self.params.get('simulate', False):
 +                return True
 +            if not download:
 +                return True
 +            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
 +                return False
 +            if info_dict.get('is_live'):
 +                return False
 +            merger = FFmpegMergerPP(self)
 +            return merger.available and merger.can_merge()
 +        if can_have_partial_formats():
 +            req_format_list.append('bestvideo+bestaudio')
 +        req_format_list.append('best')
 +        return '/'.join(req_format_list)
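
A hedged summary of what the new helper returns (ydl stands for a YoutubeDL instance; the exact outcome depends on the checks in can_have_partial_formats):

    spec = ydl._default_format_spec(info_dict, download=True)
    # --simulate or download=False            -> 'bestvideo+bestaudio/best'
    # downloading to stdout or a live video   -> 'best'
    # otherwise, with a working FFmpeg merger -> 'bestvideo+bestaudio/best'
    # otherwise (no usable FFmpeg)            -> 'best'
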
 +
      def build_format_selector(self, format_spec):
          def syntax_error(note, start):
              message = (
          if 'title' not in info_dict:
              raise ExtractorError('Missing "title" field in extractor result')
  
 -        if not isinstance(info_dict['id'], compat_str):
 -            self.report_warning('"id" field is not a string - forcing string conversion')
 -            info_dict['id'] = compat_str(info_dict['id'])
 +        def report_force_conversion(field, field_not, conversion):
 +            self.report_warning(
 +                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
 +                % (field, field_not, conversion))
 +
 +        def sanitize_string_field(info, string_field):
 +            field = info.get(string_field)
 +            if field is None or isinstance(field, compat_str):
 +                return
 +            report_force_conversion(string_field, 'a string', 'string')
 +            info[string_field] = compat_str(field)
 +
 +        def sanitize_numeric_fields(info):
 +            for numeric_field in self._NUMERIC_FIELDS:
 +                field = info.get(numeric_field)
 +                if field is None or isinstance(field, compat_numeric_types):
 +                    continue
 +                report_force_conversion(numeric_field, 'numeric', 'int')
 +                info[numeric_field] = int_or_none(field)
 +
 +        sanitize_string_field(info_dict, 'id')
 +        sanitize_numeric_fields(info_dict)
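
The net effect of the two sanitizers, on a hypothetical malformed extractor result:

    info_dict = {'id': 1234, 'view_count': '42', 'title': 'x'}
    # sanitize_string_field(info_dict, 'id')  -> info_dict['id'] == '1234'
    # sanitize_numeric_fields(info_dict)      -> info_dict['view_count'] == 42
    # Each coercion also emits a report_force_conversion() warning, since the
    # wrong type indicates a bug in the extractor.
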
  
          if 'playlist' not in info_dict:
              # It isn't part of a playlist
          if not formats:
              raise ExtractorError('No video formats found!')
  
 +        def is_wellformed(f):
 +            url = f.get('url')
 +            if not url:
 +                self.report_warning(
 +                    '"url" field is missing or empty - skipping format, '
 +                    'there is an error in extractor')
 +                return False
 +            if isinstance(url, bytes):
 +                sanitize_string_field(f, 'url')
 +            return True
 +
 +        # Filter out malformed formats for better extraction robustness
 +        formats = list(filter(is_wellformed, formats))
 +
          formats_dict = {}
  
          # We check that all the formats have the format and format_id fields
          for i, format in enumerate(formats):
 -            if 'url' not in format:
 -                raise ExtractorError('Missing "url" key in result (index %d)' % i)
 -
 +            sanitize_string_field(format, 'format_id')
 +            sanitize_numeric_fields(format)
              format['url'] = sanitize_url(format['url'])
 -
 -            if format.get('format_id') is None:
 +            if not format.get('format_id'):
                  format['format_id'] = compat_str(i)
              else:
                  # Sanitize format_id from characters used in format selector expression
  
          req_format = self.params.get('format')
          if req_format is None:
 -            req_format_list = []
 -            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
 -                    not info_dict.get('is_live')):
 -                merger = FFmpegMergerPP(self)
 -                if merger.available and merger.can_merge():
 -                    req_format_list.append('bestvideo+bestaudio')
 -            req_format_list.append('best')
 -            req_format = '/'.join(req_format_list)
 +            req_format = self._default_format_spec(info_dict, download=download)
 +            if self.params.get('verbose'):
 +                self.to_stdout('[debug] Default format spec: %s' % req_format)
 +
          format_selector = self.build_format_selector(req_format)
  
          # While in format selection we may need to have an access to the original
          if filename is None:
              return
  
 -        try:
 -            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
 -            if dn and not os.path.exists(dn):
 -                os.makedirs(dn)
 -        except (OSError, IOError) as err:
 -            self.report_error('unable to create directory ' + error_to_compat_str(err))
 +        def ensure_dir_exists(path):
 +            try:
 +                dn = os.path.dirname(path)
 +                if dn and not os.path.exists(dn):
 +                    os.makedirs(dn)
 +                return True
 +            except (OSError, IOError) as err:
 +                self.report_error('unable to create directory ' + error_to_compat_str(err))
 +                return False
 +
 +        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
              return
  
          if self.params.get('writedescription', False):
              ie = self.get_info_extractor(info_dict['extractor_key'])
              for sub_lang, sub_info in subtitles.items():
                  sub_format = sub_info['ext']
 -                if sub_info.get('data') is not None:
 -                    sub_data = sub_info['data']
 +                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 +                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
 +                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                  else:
 -                    try:
 -                        sub_data = ie._download_webpage(
 -                            sub_info['url'], info_dict['id'], note=False)
 -                    except ExtractorError as err:
 -                        self.report_warning('Unable to download subtitle for "%s": %s' %
 -                                            (sub_lang, error_to_compat_str(err.cause)))
 -                        continue
 -                try:
 -                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 -                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
 -                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
 +                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
 +                    if sub_info.get('data') is not None:
 +                        try:
 +                            # Use newline='' to prevent conversion of newline characters
 +                            # See https://github.com/rg3/youtube-dl/issues/10268
 +                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
 +                                subfile.write(sub_info['data'])
 +                        except (OSError, IOError):
 +                            self.report_error('Cannot write subtitles file ' + sub_filename)
 +                            return
                      else:
 -                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
 -                        # Use newline='' to prevent conversion of newline characters
 -                        # See https://github.com/rg3/youtube-dl/issues/10268
 -                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
 -                            subfile.write(sub_data)
 -                except (OSError, IOError):
 -                    self.report_error('Cannot write subtitles file ' + sub_filename)
 -                    return
 +                        try:
 +                            sub_data = ie._request_webpage(
 +                                sub_info['url'], info_dict['id'], note=False).read()
 +                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
 +                                subfile.write(sub_data)
 +                        except (ExtractorError, IOError, OSError, ValueError) as err:
 +                            self.report_warning('Unable to download subtitle for "%s": %s' %
 +                                                (sub_lang, error_to_compat_str(err)))
 +                            continue
  
          if self.params.get('writeinfojson', False):
              infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
                          for f in requested_formats:
                              new_info = dict(info_dict)
                              new_info.update(f)
 -                            fname = self.prepare_filename(new_info)
 -                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
 +                            fname = prepend_extension(
 +                                self.prepare_filename(new_info),
 +                                'f%s' % f['format_id'], new_info['ext'])
 +                            if not ensure_dir_exists(fname):
 +                                return
                              downloaded.append(fname)
                              partial_success = dl(fname, new_info)
                              success = success and partial_success
                          info_dict.get('protocol') == 'm3u8' and
                          self.params.get('hls_prefer_native')):
                      if fixup_policy == 'warn':
 -                        self.report_warning('%s: malformated aac bitstream.' % (
 +                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                              info_dict['id']))
                      elif fixup_policy == 'detect_or_warn':
                          fixup_pp = FFmpegFixupM3u8PP(self)
                              info_dict['__postprocessors'].append(fixup_pp)
                          else:
                              self.report_warning(
 -                                '%s: malformated aac bitstream. %s'
 +                                '%s: malformed AAC bitstream detected. %s'
                                  % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                      else:
                          assert fixup_policy in ('ignore', 'never')
  
          exe_versions = FFmpegPostProcessor.get_versions(self)
          exe_versions['rtmpdump'] = rtmpdump_version()
+         exe_versions['phantomjs'] = PhantomJSwrapper._version()
          exe_str = ', '.join(
              '%s %s' % (exe, v)
              for exe, v in sorted(exe_versions.items())
diff --combined youtube_dl/extractor/common.py
index 74d30ec50ac7f9d5db44d46bab1dd482fcc34f59,76b5378e97620137c286cf156b6e2ac790f04a8a..317a9a76fc417e9ad4455bc99b30e782849eeabc
@@@ -27,7 -27,6 +27,7 @@@ from ..compat import 
      compat_urllib_parse_urlencode,
      compat_urllib_request,
      compat_urlparse,
 +    compat_xml_parse_error,
  )
  from ..downloader.f4m import remove_encrypted_media
  from ..utils import (
@@@ -377,7 -376,7 +377,7 @@@ class InfoExtractor(object)
              cls._VALID_URL_RE = re.compile(cls._VALID_URL)
          m = cls._VALID_URL_RE.match(url)
          assert m
 -        return m.group('id')
 +        return compat_str(m.group('id'))
  
      @classmethod
      def working(cls):
              if country_code:
                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                  if self._downloader.params.get('verbose', False):
 -                    self._downloader.to_stdout(
 +                    self._downloader.to_screen(
                          '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                          % (self._x_forwarded_for_ip, country_code.upper()))
  
  
      def _download_xml(self, url_or_request, video_id,
                        note='Downloading XML', errnote='Unable to download XML',
 -                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 +                      transform_source=None, fatal=True, encoding=None,
 +                      data=None, headers={}, query={}):
          """Return the xml as an xml.etree.ElementTree.Element"""
          xml_string = self._download_webpage(
 -            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 +            url_or_request, video_id, note, errnote, fatal=fatal,
 +            encoding=encoding, data=data, headers=headers, query=query)
          if xml_string is False:
              return xml_string
 +        return self._parse_xml(
 +            xml_string, video_id, transform_source=transform_source,
 +            fatal=fatal)
 +
 +    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
          if transform_source:
              xml_string = transform_source(xml_string)
 -        return compat_etree_fromstring(xml_string.encode('utf-8'))
 +        try:
 +            return compat_etree_fromstring(xml_string.encode('utf-8'))
 +        except compat_xml_parse_error as ve:
 +            errmsg = '%s: Failed to parse XML ' % video_id
 +            if fatal:
 +                raise ExtractorError(errmsg, cause=ve)
 +            else:
 +                self.report_warning(errmsg + str(ve))
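
A hedged usage sketch of the new _parse_xml (self is an InfoExtractor; note that on a non-fatal parse failure the method implicitly returns None):

    doc = self._parse_xml(xml_string, video_id, fatal=False)
    if doc is not None:
        title = doc.findtext('./title')
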
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
              video_info['title'] = video_title
          return video_info
  
 -    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
 -        urlrs = orderedSet(
 +    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 +        urls = orderedSet(
              self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
              for m in matches)
          return self.playlist_result(
 -            urlrs, playlist_id=video_id, playlist_title=video_title)
 +            urls, playlist_id=playlist_id, playlist_title=playlist_title)
  
      @staticmethod
      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
  
      def _family_friendly_search(self, html):
          # See http://schema.org/VideoObject
 -        family_friendly = self._html_search_meta('isFamilyFriendly', html)
 +        family_friendly = self._html_search_meta(
 +            'isFamilyFriendly', html, default=None)
  
          if not family_friendly:
              return None
                  item_type = e.get('@type')
                  if expected_type is not None and expected_type != item_type:
                      return info
 -                if item_type == 'TVEpisode':
 +                if item_type in ('TVEpisode', 'Episode'):
                      info.update({
                          'episode': unescapeHTML(e.get('name')),
                          'episode_number': int_or_none(e.get('episodeNumber')),
                          'description': unescapeHTML(e.get('description')),
                      })
                      part_of_season = e.get('partOfSeason')
 -                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 +                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                          info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                      part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 -                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 +                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                          info['series'] = unescapeHTML(part_of_series.get('name'))
                  elif item_type == 'Article':
                      info.update({
                      })
                  elif item_type == 'VideoObject':
                      extract_video_object(e)
 -                elif item_type == 'WebPage':
 -                    video = e.get('video')
 -                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
 -                        extract_video_object(video)
 +                    continue
 +                video = e.get('video')
 +                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
 +                    extract_video_object(video)
                  break
          return dict((k, v) for k, v in info.items() if v is not None)
  
                      ms_info['timescale'] = int(timescale)
                  segment_duration = source.get('duration')
                  if segment_duration:
 -                    ms_info['segment_duration'] = int(segment_duration)
 +                    ms_info['segment_duration'] = float(segment_duration)
  
              def extract_Initialization(source):
                  initialization = source.find(_add_ns('Initialization'))
                                  'Bandwidth': bandwidth,
                              }
  
 +                        def location_key(location):
 +                            return 'url' if re.match(r'^https?://', location) else 'path'
 +
                          if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
  
                              media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
 +                            media_location_key = location_key(media_template)
  
                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                              # can't be used at the same time
                                      segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                      representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                  representation_ms_info['fragments'] = [{
 -                                    'url': media_template % {
 +                                    media_location_key: media_template % {
                                          'Number': segment_number,
                                          'Bandwidth': bandwidth,
                                      },
                                          'Number': segment_number,
                                      }
                                      representation_ms_info['fragments'].append({
 -                                        'url': segment_url,
 +                                        media_location_key: segment_url,
                                          'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                      })
  
                              for s in representation_ms_info['s']:
                                  duration = float_or_none(s['d'], timescale)
                                  for r in range(s.get('r', 0) + 1):
 +                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                      fragments.append({
 -                                        'url': representation_ms_info['segment_urls'][segment_index],
 +                                        location_key(segment_uri): segment_uri,
                                          'duration': duration,
                                      })
                                      segment_index += 1
                          # No fragments key is present in this case.
                          if 'fragments' in representation_ms_info:
                              f.update({
 +                                'fragment_base_url': base_url,
                                  'fragments': [],
                                  'protocol': 'http_dash_segments',
                              })
                                  initialization_url = representation_ms_info['initialization_url']
                                  if not f.get('url'):
                                      f['url'] = initialization_url
 -                                f['fragments'].append({'url': initialization_url})
 +                                f['fragments'].append({location_key(initialization_url): initialization_url})
                              f['fragments'].extend(representation_ms_info['fragments'])
 -                            for fragment in f['fragments']:
 -                                fragment['url'] = urljoin(base_url, fragment['url'])
                          try:
                              existing_format = next(
                                  fo for fo in formats
              compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
  
      def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
 +        """
 +        Parse formats from ISM manifest.
 +        References:
 +         1. [MS-SSTR]: Smooth Streaming Protocol,
 +            https://msdn.microsoft.com/en-us/library/ff469518.aspx
 +        """
          if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
              return []
  
                      self.report_warning('%s is not a supported codec' % fourcc)
                      continue
                  tbr = int(track.attrib['Bitrate']) // 1000
 -                width = int_or_none(track.get('MaxWidth'))
 -                height = int_or_none(track.get('MaxHeight'))
 +                # [1] does not mention Width and Height attributes. However,
 +                # they're often present while MaxWidth and MaxHeight are
 +                # missing, so should be used as fallbacks
 +                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
 +                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                  sampling_rate = int_or_none(track.get('SamplingRate'))
  
                  track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                  return f
              return {}
  
 -        def _media_formats(src, cur_media_type):
 +        def _media_formats(src, cur_media_type, type_info={}):
              full_url = absolute_url(src)
 -            ext = determine_ext(full_url)
 +            ext = type_info.get('ext') or determine_ext(full_url)
              if ext == 'm3u8':
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
 -                    preference=preference)
 +                    preference=preference, fatal=False)
              elif ext == 'mpd':
                  is_plain_url = False
                  formats = self._extract_mpd_formats(
 -                    full_url, video_id, mpd_id=mpd_id)
 +                    full_url, video_id, mpd_id=mpd_id, fatal=False)
              else:
                  is_plain_url = True
                  formats = [{
              return is_plain_url, formats
  
          entries = []
 +        # amp-video and amp-audio are very similar to their HTML5 counterparts
 +        # so we will include them right here (see
 +        # https://www.ampproject.org/docs/reference/components/amp-video)
          media_tags = [(media_tag, media_type, '')
                        for media_tag, media_type
 -                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
 +                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
          media_tags.extend(re.findall(
              # We only allow video|audio followed by a whitespace or '>'.
              # Allowing more characters may end up in significant slow down (see
              # https://github.com/rg3/youtube-dl/issues/11979, example URL:
              # http://www.porntrex.com/maps/videositemap.xml).
 -            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
 +            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
          for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],
                      src = source_attributes.get('src')
                      if not src:
                          continue
 -                    is_plain_url, formats = _media_formats(src, media_type)
 +                    f = parse_content_type(source_attributes.get('type'))
 +                    is_plain_url, formats = _media_formats(src, media_type, f)
                      if is_plain_url:
 -                        f = parse_content_type(source_attributes.get('type'))
 +                        # res attribute is not standard but seen several times
 +                        # in the wild
 +                        f.update({
 +                            'height': int_or_none(source_attributes.get('res')),
 +                            'format_id': source_attributes.get('label'),
 +                        })
                          f.update(formats[0])
                          media_info['formats'].append(f)
                      else:
      def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
          formats = []
          hdcore_sign = 'hdcore=3.7.0'
 -        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
 +        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
          hds_host = hosts.get('hds')
          if hds_host:
              f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
  
      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
          url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
 -        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
 -        http_base_url = 'http' + url_base
 +        url_base = self._search_regex(
 +            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
 +        http_base_url = '%s:%s' % ('http', url_base)
          formats = []
          if 'm3u8' not in skip_protocols:
              formats.extend(self._extract_m3u8_formats(
              for protocol in ('rtmp', 'rtsp'):
                  if protocol not in skip_protocols:
                      formats.append({
 -                        'url': protocol + url_base,
 +                        'url': '%s:%s' % (protocol, url_base),
                          'format_id': protocol,
                          'protocol': protocol,
                      })
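
With a protocol-relative base, the '%s:%s' forms above now work; for example (URL invented):

    # url_base      = '//cdn.example.com/vod/stream'
    # http_base_url = 'http://cdn.example.com/vod/stream'
    # rtmp URL      = 'rtmp://cdn.example.com/vod/stream'
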
              tracks = video_data.get('tracks')
              if tracks and isinstance(tracks, list):
                  for track in tracks:
 +                    if not isinstance(track, dict):
 +                        continue
                      if track.get('kind') != 'captions':
                          continue
                      track_url = urljoin(base_url, track.get('file'))
          urls = []
          formats = []
          for source in jwplayer_sources_data:
 +            if not isinstance(source, dict):
 +                continue
              source_url = self._proto_relative_url(source.get('file'))
              if not source_url:
                  continue
                  self._downloader.report_warning(msg)
          return res
  
-     def _set_cookie(self, domain, name, value, expire_time=None):
+     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
+                     path='/', secure=False, discard=False, rest={}, **kwargs):
          cookie = compat_cookiejar.Cookie(
-             0, name, value, None, None, domain, None,
-             None, '/', True, False, expire_time, '', None, None, None)
+             0, name, value, port, port is not None, domain, True,
+             domain.startswith('.'), path, True, secure, expire_time,
+             discard, None, None, rest)
          self._downloader.cookiejar.set_cookie(cookie)
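
A hedged example of the widened _set_cookie signature, called from a hypothetical extractor:

    # Secure cookie with an explicit path and a non-standard HttpOnly flag:
    self._set_cookie('example.com', 'session', 'abc123',
                     secure=True, path='/', rest={'httpOnly': None})
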
  
      def _get_cookies(self, url):
diff --combined youtube_dl/utils.py
index c42dd4c3ae20d59b6646b64180720c4482a0e901,4d0685d83a3d888bdb84b067ecf1ebacdaa7ac7f..9e4492d402c225d53071d1424005ff5be0577681
@@@ -22,6 -22,7 +22,6 @@@ import locale
  import math
  import operator
  import os
 -import pipes
  import platform
  import random
  import re
@@@ -35,7 -36,6 +35,7 @@@ import xml.etree.ElementTree
  import zlib
  
  from .compat import (
 +    compat_HTMLParseError,
      compat_HTMLParser,
      compat_basestring,
      compat_chr,
@@@ -365,9 -365,9 +365,9 @@@ def get_elements_by_attribute(attribute, value, html, escape_value=True)
      retlist = []
      for m in re.finditer(r'''(?xs)
          <([a-zA-Z0-9:._-]+)
 -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
           \s+%s=['"]?%s['"]?
 -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
          \s*>
          (?P<content>.*?)
          </\1>
@@@ -409,12 -409,8 +409,12 @@@ def extract_attributes(html_element)
      but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
      """
      parser = HTMLAttributeParser()
 -    parser.feed(html_element)
 -    parser.close()
 +    try:
 +        parser.feed(html_element)
 +        parser.close()
 +    # Older Python may throw HTMLParseError in case of malformed HTML
 +    except compat_HTMLParseError:
 +        pass
      return parser.attrs
  
  
@@@ -596,7 -592,7 +596,7 @@@ def unescapeHTML(s)
      assert type(s) == compat_str
  
      return re.sub(
 -        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 +        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
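
Why [^&;] instead of [^;]: a dangling '&' no longer swallows a real entity that follows it. For example:

    unescapeHTML('&a&quot;')
    # Old pattern: tried 'a&quot;' as a single entity and left the string as-is.
    # New pattern: skips the dangling '&a' and returns '&a"'.
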
  
  
  def get_subprocess_encoding():
@@@ -936,6 -932,14 +936,6 @@@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler)
          except zlib.error:
              return zlib.decompress(data)
  
 -    @staticmethod
 -    def addinfourl_wrapper(stream, headers, url, code):
 -        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 -            return compat_urllib_request.addinfourl(stream, headers, url, code)
 -        ret = compat_urllib_request.addinfourl(stream, headers, url)
 -        ret.code = code
 -        return ret
 -
      def http_request(self, req):
          # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
          # always respected by websites, some tend to give out URLs with non percent-encoded
                      break
                  else:
                      raise original_ioerror
 -            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 +            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
              del resp.headers['Content-encoding']
          # deflate
          if resp.headers.get('Content-encoding', '') == 'deflate':
              gz = io.BytesIO(self.deflate(resp.read()))
 -            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 +            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
              del resp.headers['Content-encoding']
          # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
@@@ -1183,7 -1187,7 +1183,7 @@@ def unified_timestamp(date_str, day_first=False)
      if date_str is None:
          return None
  
 -    date_str = date_str.replace(',', ' ')
 +    date_str = re.sub(r'[,|]', '', date_str)
  
      pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
      timezone, date_str = extract_timezone(date_str)
@@@ -1534,7 -1538,7 +1534,7 @@@ def shell_quote(args)
          if isinstance(a, bytes):
              # We may get a filename encoded with 'encodeFilename'
              a = a.decode(encoding)
 -        quoted_args.append(pipes.quote(a))
 +        quoted_args.append(compat_shlex_quote(a))
      return ' '.join(quoted_args)
  
  
@@@ -1815,10 -1819,6 +1815,10 @@@ def float_or_none(v, scale=1, invscale=1, default=None)
          return default
  
  
 +def bool_or_none(v, default=None):
 +    return v if isinstance(v, bool) else default
 +
 +
  def strip_or_none(v):
      return None if v is None else v.strip()
  
@@@ -2098,7 -2098,7 +2098,7 @@@ def update_Request(req, url=None, data=None, headers={}, query=None)
      return new_req
  
  
 -def try_multipart_encode(data, boundary):
 +def _multipart_encode_impl(data, boundary):
      content_type = 'multipart/form-data; boundary=%s' % boundary
  
      out = b''
              v = v.encode('utf-8')
          # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
          # suggests sending UTF-8 directly. Firefox sends UTF-8, too
 -        content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n'
 +        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
          if boundary.encode('ascii') in content:
              raise ValueError('Boundary overlaps with data')
          out += content
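
A hedged usage sketch of the public wrapper around _multipart_encode_impl:

    body, content_type = multipart_encode({'username': 'user', 'token': 't0k3n'})
    # body is the encoded bytes payload; content_type carries the boundary and
    # belongs in the request's Content-Type header. If the random boundary
    # collides with the data, multipart_encode retries with a fresh one.
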
@@@ -2140,7 -2140,7 +2140,7 @@@ def multipart_encode(data, boundary=None)
              boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  
          try:
 -            out, content_type = try_multipart_encode(data, boundary)
 +            out, content_type = _multipart_encode_impl(data, boundary)
              break
          except ValueError:
              if has_specified_boundary:
@@@ -2211,12 -2211,7 +2211,12 @@@ def parse_age_limit(s)
  
  def strip_jsonp(code):
      return re.sub(
 -        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
 +        r'''(?sx)^
 +            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
 +            (?:\s*&&\s*(?P=func_name))?
 +            \s*\(\s*(?P<callback_data>.*)\);?
 +            \s*?(?://[^\n]*)*$''',
 +        r'\g<callback_data>', code)
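
Examples the extended pattern now covers (inputs invented):

    strip_jsonp('window.cb && cb({"id": 42});')      # -> '{"id": 42}'
    strip_jsonp('parse({"status": "ok"}); // done')  # -> '{"status": "ok"}'
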
  
  
  def js_to_json(code):
@@@ -2365,11 -2360,11 +2365,11 @@@ def parse_codecs(codecs_str)
          if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
              if not vcodec:
                  vcodec = full_codec
 -        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
 +        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
              if not acodec:
                  acodec = full_codec
          else:
 -            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
 +            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
      if not vcodec and not acodec:
          if len(splited_codecs) == 2:
              return {
@@@ -2737,8 -2732,6 +2737,8 @@@ def cli_option(params, command_option, param)
  
  def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
      param = params.get(param)
 +    if param is None:
 +        return []
      assert isinstance(param, bool)
      if separator:
          return [command_option + separator + (true_value if param else false_value)]
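
Hedged examples of the None-tolerant behaviour added above (the unshown tail of the function returns the two-element form):

    cli_bool_option({}, '--flag', 'opt')             # -> []
    cli_bool_option({'opt': True}, '--flag', 'opt')  # -> ['--flag', 'true']
    cli_bool_option({'opt': False}, '--check-certificate', 'opt',
                    separator='=')                   # -> ['--check-certificate=false']
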
@@@ -3822,6 -3815,219 +3822,219 @@@ def write_xattr(path, key, value)
                          "or the 'xattr' binary.")
  
  
+ def cookie_to_dict(cookie):
+     cookie_dict = {
+         'name': cookie.name,
+         'value': cookie.value,
+     }
+     if cookie.port_specified:
+         cookie_dict['port'] = cookie.port
+     if cookie.domain_specified:
+         cookie_dict['domain'] = cookie.domain
+     if cookie.path_specified:
+         cookie_dict['path'] = cookie.path
+     if cookie.expires is not None:
+         cookie_dict['expires'] = cookie.expires
+     if cookie.secure is not None:
+         cookie_dict['secure'] = cookie.secure
+     if cookie.discard is not None:
+         cookie_dict['discard'] = cookie.discard
+     try:
+         if (cookie.has_nonstandard_attr('httpOnly') or
+             cookie.has_nonstandard_attr('httponly') or
+             cookie.has_nonstandard_attr('HttpOnly')):
+             cookie_dict['httponly'] = True
+     except TypeError:
+         pass
+     return cookie_dict
+ 
+ 
+ def cookie_jar_to_list(cookie_jar):
+     return [cookie_to_dict(cookie) for cookie in cookie_jar]
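
A hedged sketch of how the wrapper below hands the session's cookies to PhantomJS (ydl stands for a YoutubeDL instance; the path is invented):

    import json

    with open('cookies.json', 'w') as f:
        json.dump(cookie_jar_to_list(ydl.cookiejar), f)
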
+ 
+ 
+ class PhantomJSwrapper(object):
+     """PhantomJS wrapper class"""
+     _TEMPLATE = r'''
+         phantom.onError = function(msg, trace) {{
+           var msgStack = ['PHANTOM ERROR: ' + msg];
+           if(trace && trace.length) {{
+             msgStack.push('TRACE:');
+             trace.forEach(function(t) {{
+               msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+                 + (t.function ? ' (in function ' + t.function +')' : ''));
+             }});
+           }}
+           console.error(msgStack.join('\n'));
+           phantom.exit(1);
+         }};
+         var page = require('webpage').create();
+         var fs = require('fs');
+         var read = {{ mode: 'r', charset: 'utf-8' }};
+         var write = {{ mode: 'w', charset: 'utf-8' }};
+         JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
+           phantom.addCookie(x);
+         }});
+         page.settings.resourceTimeout = {timeout};
+         page.settings.userAgent = "{ua}";
+         page.onLoadStarted = function() {{
+           page.evaluate(function() {{
+             delete window._phantom;
+             delete window.callPhantom;
+           }});
+         }};
+         var saveAndExit = function() {{
+           fs.write("{html}", page.content, write);
+           fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
+           phantom.exit();
+         }};
+         page.onLoadFinished = function(status) {{
+           if(page.url === "") {{
+             page.setContent(fs.read("{html}", read), "{url}");
+           }}
+           else {{
+             {jscode}
+           }}
+         }};
+         page.open("");
+     '''
+ 
+     _TMP_FILE_NAMES = ['script', 'html', 'cookies']
+ 
+     @staticmethod
+     def _version():
+         return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
+ 
+     def __init__(self, extractor, required_version=None, timeout=10000):
+         self.exe = check_executable('phantomjs', ['-v'])
+         if not self.exe:
+             raise ExtractorError('PhantomJS executable not found in PATH, '
+                                  'download it from http://phantomjs.org',
+                                  expected=True)
+         self.extractor = extractor
+         if required_version:
+             version = self._version()
+             if is_outdated_version(version, required_version):
+                 self.extractor._downloader.report_warning(
+                     'Your copy of PhantomJS is outdated, update it to version '
+                     '%s or newer if you encounter any errors.' % required_version)
+         self.options = {
+             'timeout': timeout,
+         }
+         self._TMP_FILES = {}
+         for name in self._TMP_FILE_NAMES:
+             tmp = tempfile.NamedTemporaryFile(delete=False)
+             tmp.close()
+             self._TMP_FILES[name] = tmp
+ 
+     def __del__(self):
+         for name in self._TMP_FILE_NAMES:
+             try:
+                 os.remove(self._TMP_FILES[name].name)
+             except (IOError, OSError, KeyError):
+                 pass
+ 
+     def _save_cookies(self, url):
+         cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
+         for cookie in cookies:
+             if 'path' not in cookie:
+                 cookie['path'] = '/'
+             if 'domain' not in cookie:
+                 cookie['domain'] = compat_urlparse.urlparse(url).netloc
+         with open(self._TMP_FILES['cookies'].name, 'wb') as f:
+             f.write(json.dumps(cookies).encode('utf-8'))
+ 
+     def _load_cookies(self):
+         with open(self._TMP_FILES['cookies'].name, 'rb') as f:
+             cookies = json.loads(f.read().decode('utf-8'))
+         for cookie in cookies:
+             if cookie['httponly'] is True:
+                 cookie['rest'] = {'httpOnly': None}
+             if 'expiry' in cookie:
+                 cookie['expire_time'] = cookie['expiry']
+             self.extractor._set_cookie(**cookie)
+ 
+     def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
+         """
+         Downloads webpage (if needed) and executes JS
+         
+         Params:
+             url: website url
+             html: optional, html code of website
+             video_id: video id
+             note: optional, displayed when downloading webpage
+             note2: optional, displayed when executing JS
+             headers: custom http headers
+             jscode: code to be executed when page is loaded
+         
+         Returns tuple with:
+             * downloaded website (after JS execution)
+             * anything you print with `console.log` (but not inside `page.evaluate`!)
+         
+         In most cases you don't need to add any `jscode`.
+         It is executed in `page.onLoadFinished`.
+         `saveAndExit();` is mandatory; use it instead of `phantom.exit()`.
+         It is possible to wait for some element on the webpage, for example:
+             var check = function() {
+               var elementFound = page.evaluate(function() {
+                 return document.querySelector('#b.done') !== null;
+               });
+               if(elementFound)
+                 saveAndExit();
+               else
+                 window.setTimeout(check, 500);
+             }
+             
+             page.evaluate(function(){
+               document.querySelector('#a').click();
+             });
+             check();
+         """
+         if 'saveAndExit();' not in jscode:
+             raise ExtractorError('`saveAndExit();` not found in `jscode`')
+         if not html:
+             html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
+         with open(self._TMP_FILES['html'].name, 'wb') as f:
+             f.write(html.encode('utf-8'))
+         self._save_cookies(url)
+         replaces = self.options
+         replaces['url'] = url
+         user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+         replaces['ua'] = user_agent.replace('"', '\\"')
+         replaces['jscode'] = jscode
+         for x in self._TMP_FILE_NAMES:
+             replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
+         with open(self._TMP_FILES['script'].name, 'wb') as f:
+             f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
+         if video_id is None:
+             self.extractor.to_screen('%s' % (note2,))
+         else:
+             self.extractor.to_screen('%s: %s' % (video_id, note2))
+         p = subprocess.Popen([self.exe, '--ssl-protocol=any',
+             self._TMP_FILES['script'].name], stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE)
+         out, err = p.communicate()
+         if p.returncode != 0:
+             raise ExtractorError('Executing JS failed:\n'
+                                  + encodeArgument(err))
+         with open(self._TMP_FILES['html'].name, 'rb') as f:
+             html = f.read().decode('utf-8')
+         self._load_cookies()
+         return (html, encodeArgument(out))
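
A hedged usage sketch of the wrapper from a hypothetical extractor; the selector is invented and the jscode mirrors the wait-for-element pattern from the docstring:

    phantom = PhantomJSwrapper(self, required_version='2.0')
    webpage, logs = phantom.get(
        url, video_id=video_id, note2='Waiting for the player',
        jscode='''
            var check = function() {
              if (page.evaluate(function() {
                    return document.querySelector('#player') !== null;
                  }))
                saveAndExit();
              else
                window.setTimeout(check, 500);
            };
            check();
        ''')
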
  def random_birthday(year_field, month_field, day_field):
      return {
          year_field: str(random.randint(1950, 1995)),