[openload] separate PhantomJS code from extractor
[youtube-dl] / youtube_dl / utils.py
index cbf7639c56393f21bd259ee30e294d1db704fb6b..94e1b07a627983639553938977c5bd7005649445 100644 (file)
@@ -39,6 +39,7 @@ from .compat import (
     compat_basestring,
     compat_chr,
     compat_etree_fromstring,
+    compat_expanduser,
     compat_html_entities,
     compat_html_entities_html5,
     compat_http_client,
@@ -473,7 +474,8 @@ def timeconvert(timestr):
 def sanitize_filename(s, restricted=False, is_id=False):
     """Sanitizes a string so it could be used as part of a filename.
     If restricted is set, use a stricter subset of allowed characters.
-    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
+    Set is_id if this is not an arbitrary string, but an ID that should be kept
+    if possible.
     """
     def replace_insane(char):
         if restricted and char in ACCENT_CHARS:
@@ -538,6 +540,11 @@ def sanitized_Request(url, *args, **kwargs):
     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 
 
+def expand_path(s):
+    """Expand ~ (via compat_expanduser) and then shell variables in path s"""
+    return os.path.expandvars(compat_expanduser(s))
+
+
 def orderedSet(iterable):
     """ Remove all duplicates from the input iterable """
     res = []
@@ -1747,11 +1754,16 @@ def base_url(url):
 
 
 def urljoin(base, path):
+    if isinstance(path, bytes):  # byte strings are taken to be UTF-8
+        path = path.decode('utf-8')
     if not isinstance(path, compat_str) or not path:
         return None
     if re.match(r'^(?:https?:)?//', path):
         return path
-    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
+    if isinstance(base, bytes):  # same UTF-8 decoding as for path
+        base = base.decode('utf-8')
+    if not isinstance(base, compat_str) or not re.match(
+            r'^(?:https?:)?//', base):
         return None
     return compat_urlparse.urljoin(base, path)
 
@@ -3291,7 +3303,7 @@ class GeoUtils(object):
         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
         addr_max = addr_min | (0xffffffff >> int(preflen))
         return compat_str(socket.inet_ntoa(
-            compat_struct_pack('!I', random.randint(addr_min, addr_max))))
+            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
 
 
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
@@ -3319,6 +3331,57 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
             self, req, proxy, type)
 
 
+# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which has
+# been released into the public domain
+# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
+
+def long_to_bytes(n, blocksize=0):
+    """long_to_bytes(n:long, blocksize:int) : string
+    Convert a long integer to a big-endian byte string.
+
+    The result has no leading zero bytes; a value that is not positive
+    (zero included) yields a single zero byte.
+
+    If optional blocksize is given and greater than zero, pad the front of the
+    byte string with binary zeros so that the length is a multiple of
+    blocksize.
+    """
+    n = int(n)
+    # peel off 32 bits at a time; the words come out least significant first
+    # and are put into big-endian order when they are joined
+    words = []
+    while n > 0:
+        words.append(compat_struct_pack('>I', n & 0xffffffff))
+        n >>= 32
+    s = b''.join(reversed(words))
+    # the most significant word may start with zero bytes: strip them, but
+    # keep one zero byte when nothing else is left
+    s = s.lstrip(b'\000') or b'\000'
+    # left-pad with zero bytes up to the next multiple of blocksize
+    if blocksize > 0:
+        remainder = len(s) % blocksize
+        if remainder:
+            s = (blocksize - remainder) * b'\000' + s
+    return s
+
+
+def bytes_to_long(s):
+    """bytes_to_long(string) : long
+    Convert a big-endian byte string to a long integer.
+
+    This is (essentially) the inverse of long_to_bytes().
+    """
+    # left-pad with zero bytes to a whole number of 32-bit words
+    missing = -len(s) % 4
+    if missing:
+        s = b'\000' * missing + s
+    value = 0
+    for offset in range(0, len(s), 4):
+        word, = compat_struct_unpack('>I', s[offset:offset + 4])
+        value = (value << 32) + word
+    return value
+
+
 def ohdave_rsa_encrypt(data, exponent, modulus):
     '''
     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
@@ -3336,6 +3399,21 @@ def ohdave_rsa_encrypt(data, exponent, modulus):
     return '%x' % encrypted
 
 
+def pkcs1pad(data, length):
+    """Pad input data with the PKCS#1 v1.5 scheme (block type 2)
+
+    @param {int[]} data        input data
+    @param {int}   length      target length
+    @returns {int[]}           padded data: 0, 2, non-zero filler, 0, data
+    @raises ValueError         if data is longer than length - 11
+    """
+    if len(data) > length - 11:
+        raise ValueError('Input data too long for PKCS#1 padding')
+    # filler bytes must be non-zero: the next 0 marks where the data starts
+    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
+    return [0, 2] + padding + [0] + data
+
+
 def encode_base_n(num, n, table=None):
     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
     if not table:
@@ -3574,3 +3652,144 @@ def write_xattr(path, key, value):
                         "Couldn't find a tool to set the xattrs. "
                         "Install either the python 'xattr' module, "
                         "or the 'xattr' binary.")
+
+
+class PhantomJSwrapper(object):
+    """PhantomJS wrapper: loads HTML in PhantomJS, runs JS on it, returns the page"""
+
+    _TEMPLATE = r'''
+        phantom.onError = function(msg, trace) {{
+          var msgStack = ['PHANTOM ERROR: ' + msg];
+          if(trace && trace.length) {{
+            msgStack.push('TRACE:');
+            trace.forEach(function(t) {{
+              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+                + (t.function ? ' (in function ' + t.function +')' : ''));
+            }});
+          }}
+          console.error(msgStack.join('\n'));
+          phantom.exit(1);
+        }};
+        var page = require('webpage').create();
+        var fs = require('fs');
+        var read = {{ mode: 'r', charset: 'utf-8' }};
+        var write = {{ mode: 'w', charset: 'utf-8' }};
+        page.settings.resourceTimeout = {timeout};
+        page.settings.userAgent = "{ua}";
+        page.onLoadStarted = function() {{
+          page.evaluate(function() {{
+            delete window._phantom;
+            delete window.callPhantom;
+          }});
+        }};
+        var saveAndExit = function() {{
+          fs.write("{html}", page.content, write);
+          phantom.exit();
+        }};
+        page.onLoadFinished = function(status) {{
+          if(page.url === "") {{
+            page.setContent(fs.read("{html}", read), "{url}");
+          }}
+          else {{
+            {jscode}
+          }}
+        }};
+        page.open("");
+    '''
+
+    _TMP_FILE_NAMES = ['script', 'html']
+
+    def __init__(self, extractor, timeout=10000):
+        """timeout ends up as page.settings.resourceTimeout in the script"""
+        # set first so that __del__ also works when the check below raises
+        self._TMP_FILES = {}
+        self.exe = check_executable('phantomjs', ['-v'])
+        if not self.exe:
+            raise ExtractorError('PhantomJS executable not found in PATH, '
+                                 'download it from http://phantomjs.org',
+                                 expected=True)
+        self.extractor = extractor
+        self.options = {'timeout': timeout}
+        for name in self._TMP_FILE_NAMES:
+            tmp = tempfile.NamedTemporaryFile(delete=False)
+            tmp.close()
+            self._TMP_FILES[name] = tmp
+
+    def __del__(self):
+        # best-effort cleanup: a temp file may be missing or already removed
+        for name in self._TMP_FILE_NAMES:
+            try:
+                os.remove(self._TMP_FILES[name].name)
+            except (IOError, OSError, KeyError):
+                pass
+
+    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
+        """
+        Downloads webpage (if needed) and executes JS
+
+        Params:
+            url: website url
+            html: optional, html code of website
+            video_id: video id
+            note: optional, displayed when downloading webpage
+            note2: optional, displayed when executing JS
+            headers: custom http headers
+            jscode: code to be executed when page is loaded
+
+        Returns tuple with:
+            * downloaded website (after JS execution)
+            * anything you print with `console.log` (but not inside `page.evaluate`!)
+
+        Raises ExtractorError if `saveAndExit();` is missing or PhantomJS fails.
+
+        In most cases you don't need to add any `jscode`.
+        It is executed in `page.onLoadFinished`.
+        `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
+        It is possible to wait for some element on the webpage, for example:
+            var check = function() {
+              var elementFound = page.evaluate(function() {
+                return document.querySelector('#b.done') !== null;
+              });
+              if(elementFound)
+                saveAndExit();
+              else
+                window.setTimeout(check, 500);
+            }
+
+            page.evaluate(function(){
+              document.querySelector('#a').click();
+            });
+            check();
+        """
+        if 'saveAndExit();' not in jscode:
+            raise ExtractorError('`saveAndExit();` not found in `jscode`')
+        if not html:
+            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
+        with open(self._TMP_FILES['html'].name, 'wb') as f:
+            f.write(html.encode('utf-8'))
+
+        replaces = dict(self.options)
+        replaces['url'] = url
+        replaces['ua'] = headers.get('User-Agent') or std_headers['User-Agent']
+        for x in self._TMP_FILE_NAMES:
+            replaces[x] = self._TMP_FILES[x].name
+        # these values are pasted into double-quoted JS string literals
+        for key in ['url', 'ua'] + self._TMP_FILE_NAMES:
+            replaces[key] = replaces[key].replace('\\', '\\\\').replace('"', '\\"')
+        replaces['jscode'] = jscode
+        with open(self._TMP_FILES['script'].name, 'wb') as f:
+            f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
+
+        prefix = '' if video_id is None else '%s: ' % (video_id,)
+        self.extractor.to_screen('%s%s' % (prefix, note2))
+
+        p = subprocess.Popen(
+            [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name],
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        out, err = p.communicate()
+        if p.returncode != 0:
+            raise ExtractorError('Executing JS failed:\n' + encodeArgument(err))
+        with open(self._TMP_FILES['html'].name, 'rb') as f:
+            html = f.read().decode('utf-8')
+        return (html, encodeArgument(out))
+