Merge remote-tracking branch 'dstftw/escape-non-ascii-in-urls'

author Philipp Hagemeister <phihag@phihag.de>
Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
diff --combined test/test_utils.py

index 70050d2b836aedfd7310bbee2c2f07037d70c71a,e90caed2961ba24689d135c2fc007ef565a26ae7..3efbed29dd34de570f2db4e6eb4954ec2f4b9c6e
--- 1/test/test_utils.py
--- 2/test/test_utils.py
+++ b/test/test_utils.py
@@@ -40,7 -40,8 +40,9 @@@ from youtube_dl.utils import 
       parse_iso8601,
       strip_jsonp,
       uppercase_escape,
+ +    limit_length,
+     escape_rfc3986,
+     escape_url,
   )
   
   
@@@ -287,12 -288,34 +289,41 @@@ class TestUtil(unittest.TestCase)
           self.assertEqual(uppercase_escape('aä'), 'aä')
           self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
   
+ +    def test_limit_length(self):
+ +        self.assertEqual(limit_length(None, 12), None)
+ +        self.assertEqual(limit_length('foo', 12), 'foo')
+ +        self.assertTrue(
+ +            limit_length('foo bar baz asd', 12).startswith('foo bar'))
+ +        self.assertTrue('...' in limit_length('foo bar baz asd', 12))
+ +
+     def test_escape_rfc3986(self):
+         reserved = "!*'();:@&=+$,/?#[]"
+         unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
+         self.assertEqual(escape_rfc3986(reserved), reserved)
+         self.assertEqual(escape_rfc3986(unreserved), unreserved)
+         self.assertEqual(escape_rfc3986('тест'), '%D1%82%D0%B5%D1%81%D1%82')
+         self.assertEqual(escape_rfc3986('%D1%82%D0%B5%D1%81%D1%82'), '%D1%82%D0%B5%D1%81%D1%82')
+         self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')
+         self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')
+ 
+     def test_escape_url(self):
+         self.assertEqual(
+             escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
+             'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'
+         )
+         self.assertEqual(
+             escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
+             'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
+         )
+         self.assertEqual(
+             escape_url('http://тест.рф/фрагмент'),
+             'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'
+         )
+         self.assertEqual(
+             escape_url('http://тест.рф/абв?абв=абв#абв'),
+             'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'
+         )
+         self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+ 
   if __name__ == '__main__':
       unittest.main()
diff --combined youtube_dl/utils.py

index 3ac0f1f541745b1ec34f1245574ea82387c3115d,e924b1688d1d7e61c3ef8aded9fab8d98a873394..b644f4e920bf0353658ec9920abdb0541dbaf0e2
--- 1/youtube_dl/utils.py
--- 2/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@@ -280,11 -280,6 +280,11 @@@ if sys.version_info >= (2, 7)
           return node.find(expr)
   else:
       def find_xpath_attr(node, xpath, key, val):
+ +        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+ +        # .//node does not match if a node is a direct child of . !
+ +        if isinstance(xpath, unicode):
+ +            xpath = xpath.encode('ascii')
+ +
           for f in node.findall(xpath):
               if f.attrib.get(key) == val:
                   return f
@@@ -304,20 -299,6 +304,20 @@@ def xpath_with_ns(path, ns_map)
       return '/'.join(replaced)
   
   
+ +def xpath_text(node, xpath, name=None, fatal=False):
+ +    if sys.version_info < (2, 7):  # Crazy 2.6
+ +        xpath = xpath.encode('ascii')
+ +
+ +    n = node.find(xpath)
+ +    if n is None:
+ +        if fatal:
+ +            name = xpath if name is None else name
+ +            raise ExtractorError('Could not find XML element %s' % name)
+ +        else:
+ +            return None
+ +    return n.text
+ +
+ +
   compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
   class BaseHTMLParser(compat_html_parser.HTMLParser):
       def __init(self):
@@@ -1437,6 -1418,24 +1437,24 @@@ def uppercase_escape(s)
           lambda m: unicode_escape(m.group(0))[0],
           s)
   
+ 
+ def escape_rfc3986(s):
+     """Escape non-ASCII characters as suggested by RFC 3986"""
+     if sys.version_info < (3, 0) and isinstance(s, unicode):
+         s = s.encode('utf-8')
+     return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ 
+ 
+ def escape_url(url):
+     """Escape URL as suggested by RFC 3986"""
+     url_parsed = compat_urllib_parse_urlparse(url)
+     return url_parsed._replace(
+         path=escape_rfc3986(url_parsed.path),
+         params=escape_rfc3986(url_parsed.params),
+         query=escape_rfc3986(url_parsed.query),
+         fragment=escape_rfc3986(url_parsed.fragment)
+     ).geturl()
+ 
   try:
       struct.pack(u'!I', 0)
   except TypeError:
@@@ -1571,13 -1570,3 +1589,13 @@@ except AttributeError
           if ret:
               raise subprocess.CalledProcessError(ret, p.args, output=output)
           return output
+ +
+ +
+ +def limit_length(s, length):
+ +    """ Add ellipses to overly long strings """
+ +    if s is None:
+ +        return None
+ +    ELLIPSES = '...'
+ +    if len(s) > length:
+ +        return s[:length - len(ELLIPSES)] + ELLIPSES
+ +    return s
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 15 Sep 2014 13:40:10 +0000 (15:40 +0200)
		1	2
test/test_utils.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/utils.py	patch \|	diff1 \|	diff2 \|	blob \| history