Merge pull request #7045 from remitamine/ign
[youtube-dl] / youtube_dl / downloader / f4m.py
index 275564b5976b9d28a7d67f839c81467029aa5c18..aaf0c49c8cb474e397a71988502848ed8351336b 100644 (file)
@@ -5,14 +5,18 @@ import io
 import itertools
 import os
 import time
-import xml.etree.ElementTree as etree
 
 from .fragment import FragmentFD
 from ..compat import (
+    compat_etree_fromstring,
     compat_urlparse,
     compat_urllib_error,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    encodeFilename,
+    fix_xml_ampersands,
+    sanitize_open,
     struct_pack,
     struct_unpack,
     xpath_text,
@@ -283,9 +287,14 @@ class F4mFD(FragmentFD):
         man_url = info_dict['url']
         requested_bitrate = info_dict.get('tbr')
         self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
-        manifest = self.ydl.urlopen(man_url).read()
-
-        doc = etree.fromstring(manifest)
+        urlh = self.ydl.urlopen(man_url)
+        man_url = urlh.geturl()
+        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244
+        # and https://github.com/rg3/youtube-dl/issues/7823)
+        manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()
+
+        doc = compat_etree_fromstring(manifest)
         formats = [(int(f.attrib.get('bitrate', -1)), f)
                    for f in self._get_unencrypted_media(doc)]
         if requested_bitrate is None:
@@ -327,34 +336,40 @@ class F4mFD(FragmentFD):
         if not live:
             write_metadata_tag(dest_stream, metadata)
 
+        base_url_parsed = compat_urllib_parse_urlparse(base_url)
+
         self._start_frag_download(ctx)
 
         frags_filenames = []
         while fragments_list:
             seg_i, frag_i = fragments_list.pop(0)
             name = 'Seg%d-Frag%d' % (seg_i, frag_i)
-            url = base_url + name
+            query = []
+            if base_url_parsed.query:
+                query.append(base_url_parsed.query)
             if akamai_pv:
-                url += '?' + akamai_pv.strip(';')
+                query.append(akamai_pv.strip(';'))
             if info_dict.get('extra_param_to_segment_url'):
-                url += info_dict.get('extra_param_to_segment_url')
+                query.append(info_dict['extra_param_to_segment_url'])
+            url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
             frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
             try:
-                success = ctx['dl'].download(frag_filename, {'url': url})
+                success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()})
                 if not success:
                     return False
-                with open(frag_filename, 'rb') as down:
-                    down_data = down.read()
-                    reader = FlvReader(down_data)
-                    while True:
-                        _, box_type, box_data = reader.read_box_info()
-                        if box_type == b'mdat':
-                            dest_stream.write(box_data)
-                            break
+                (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
+                down_data = down.read()
+                down.close()
+                reader = FlvReader(down_data)
+                while True:
+                    _, box_type, box_data = reader.read_box_info()
+                    if box_type == b'mdat':
+                        dest_stream.write(box_data)
+                        break
                 if live:
-                    os.remove(frag_filename)
+                    os.remove(encodeFilename(frag_sanitized))
                 else:
-                    frags_filenames.append(frag_filename)
+                    frags_filenames.append(frag_sanitized)
             except (compat_urllib_error.HTTPError, ) as err:
                 if live and (err.code == 404 or err.code == 410):
                     # We didn't keep up with the live window. Continue
@@ -375,6 +390,6 @@ class F4mFD(FragmentFD):
         self._finish_frag_download(ctx)
 
         for frag_file in frags_filenames:
-            os.remove(frag_file)
+            os.remove(encodeFilename(frag_file))
 
         return True