Merge pull request #7769 from remitamine/sort

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index dc508050413c8490882323e01d91ea3a3ba88c9a..34a28c126e068aa55f072cc011016c63d945a409 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,18 +10,17 @@ import re
  import socket
  import sys
  import time
-import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
-    compat_HTTPError,
+    compat_getpass,
      compat_http_client,
      compat_urllib_error,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
+    compat_urllib_parse,
      compat_urlparse,
      compat_str,
+    compat_etree_fromstring,
  )
  from ..utils import (
      NO_DEFAULT,
@@ -30,13 +29,20 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
+    error_to_compat_str,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
+    sanitized_Request,
      unescapeHTML,
+    unified_strdate,
+    url_basename,
+    xpath_text,
+    xpath_with_ns,
+    determine_protocol,
  )
  
  
@@ -147,6 +153,7 @@ class InfoExtractor(object):
      description:    Full video description.
      uploader:       Full name of the video uploader.
      creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
                      If not explicitly set, calculated from timestamp.
@@ -158,12 +165,14 @@ class InfoExtractor(object):
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
      average_rating: Average rating give by users, the scale used depends on the webpage
      comment_count:  Number of comments on the video
      comments:       A list of comments, each with one or more of the following
@@ -200,8 +209,8 @@ class InfoExtractor(object):
      There must be a key "entries", which is a list, an iterable, or a PagedList
      object, each element of which is a valid dictionary by this specification.
  
-    Additionally, playlists can have "title" and "id" attributes with the same
-    semantics as videos (see above).
+    Additionally, playlists can have "title", "description" and "id" attributes
+    with the same semantics as videos (see above).
  
  
      _type "multi_video" indicates that there are multiple videos that
@@ -302,11 +311,11 @@ class InfoExtractor(object):
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
-        return cls.__name__[:-2]
+        return compat_str(cls.__name__[:-2])
  
      @property
      def IE_NAME(self):
-        return type(self).__name__[:-2]
+        return compat_str(type(self).__name__[:-2])
  
      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns the response handle """
@@ -324,7 +333,8 @@ class InfoExtractor(object):
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
@@ -453,7 +463,7 @@ class InfoExtractor(object):
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
@@ -505,6 +515,18 @@ class InfoExtractor(object):
          """Report attempt to log in."""
          self.to_screen('Logging in')
  
+    @staticmethod
+    def raise_login_required(msg='This video is only available for registered users'):
+        raise ExtractorError(
+            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+            expected=True)
+
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
      # Methods for following #608
      @staticmethod
      def url_result(url, ie=None, video_id=None, video_title=None):
@@ -602,11 +624,11 @@ class InfoExtractor(object):
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
  
          return (username, password)
  
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
          """
          Get the two-factor authentication info
          TODO - asking the user will be required for sms/phone verify
@@ -620,13 +642,14 @@ class InfoExtractor(object):
          if downloader_params.get('twofactor', None) is not None:
              return downloader_params['twofactor']
  
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
  
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
@@ -636,7 +659,7 @@ class InfoExtractor(object):
      @staticmethod
      def _meta_regex(prop):
          return r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  
      def _og_search_property(self, prop, html, name=None, **kargs):
@@ -720,20 +743,23 @@ class InfoExtractor(object):
  
      @staticmethod
      def _hidden_inputs(html):
-        return dict([
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+        hidden_inputs = {}
+        for input in re.findall(r'(?i)<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
  
      def _form_hidden_inputs(self, form_id, html):
          form = self._search_regex(
-            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
              html, '%s form' % form_id, group='form')
          return self._hidden_inputs(form)
  
@@ -752,14 +778,12 @@ class InfoExtractor(object):
  
              preference = f.get('preference')
              if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -790,6 +814,7 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
@@ -817,7 +842,7 @@ class InfoExtractor(object):
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                  self.to_screen(
                      '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
@@ -848,13 +873,18 @@ class InfoExtractor(object):
          time.sleep(timeout)
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return []
  
          formats = []
          manifest_version = '1.0'
@@ -862,6 +892,11 @@ class InfoExtractor(object):
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -869,13 +904,16 @@ class InfoExtractor(object):
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -911,13 +949,15 @@ class InfoExtractor(object):
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
-        if m3u8_doc is False:
-            return m3u8_doc
+        if res is False:
+            return []
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -978,69 +1018,237 @@ class InfoExtractor(object):
          self._sort_formats(formats)
          return formats
  
-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
          if smil is False:
              assert not fatal
              return []
  
-        base = smil.find('./head/meta').get('base')
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        upload_date = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
  
          formats = []
          rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
+            src = video.get('src')
+            if not src:
+                continue
+
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            ext = video.get('ext')
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+                continue
+
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
  
          self._sort_formats(formats)
  
          return formats
  
-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
-        src = video.get('src')
-        if not src:
-            return [], rtmp_count
-        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-        width = int_or_none(video.get('width'))
-        height = int_or_none(video.get('height'))
-        proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
-        ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
-            streamer = video.get('streamer') or base
-            return ([{
-                'url': streamer,
-                'play_path': src,
-                'ext': 'flv',
-                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                SUBTITLES_TYPES = {
+                    'text/vtt': 'vtt',
+                    'text/srt': 'srt',
+                    'application/smptett+xml': 'tt',
+                }
+                if type_ in SUBTITLES_TYPES:
+                    ext = SUBTITLES_TYPES[type_]
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(xspf, playlist_id)
+
+    def _parse_xspf(self, playlist, playlist_id):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = [{
+                'url': location.text,
+                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries
  
      def _live_title(self, name):
          """ Generate the title for a live video """
@@ -1078,7 +1286,7 @@ class InfoExtractor(object):
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))
  
@@ -1120,6 +1328,23 @@ class InfoExtractor(object):
      def _get_subtitles(self, *args, **kwargs):
          raise NotImplementedError("This method must be implemented by subclasses")
  
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
      def extract_automatic_captions(self, *args, **kwargs):
          if (self._downloader.params.get('writeautomaticsub', False) or
                  self._downloader.params.get('listsubtitles')):