# dot(A, A.T) is a common operation, for example as part of computing
# covariances or square distances. The answer is symmetric, so Matlab/Octave
# special cases the code for (A*A') to roughly halve computation time:
#     http://www.walkingrandomly.com/?p=4912
# Numpy's dot doesn't currently (2013-04-25) optimize this case, and routines
# like scipy.cov are ~2x slower than they should be for large matrices.
#
# Here is a demonstration of a fast tdot(A) = dot(A,A.T). It would be great if a
# polished version of this could make it into numpy. It would need to be more
# carefully checked, and maybe deal with singles and maybe complex numbers.
# Ideally incorporating a check in dot() itself would notice when the second
# argument is the transpose of the first.
#
# As an aside, I wonder why there isn't a nice wrapper of DSYRK with the
# existing BLAS wrappers in numpy? Why aren't wrappers for *all* the standard
# BLAS routines included in numpy?
#
# Iain Murray, April 2013. iain contactable via iainmurray.net
# http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot.py

# 2013-09-23 James Hensman tells me that the BLAS functions symm, syrk, syr2k,
# hemm, herk and her2k are now wrapped in scipy.linalg, as of scipy version 0.13
# Using those wrappers could make this code somewhat simpler, at the expense of
# depending on scipy.

# 2016-01-08 added attempt to load BLAS from multiarray. On older numpy this
# file will exist, but won't provide BLAS...


import numpy as np
import ctypes
from ctypes import byref, c_char, c_int, c_double
try:
    _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__)
except:
    _blaslib = ctypes.cdll.LoadLibrary(np.core.multiarray.__file__)
try:
    _dsyrk = _blaslib.dsyrk_
except:
    raise Exception("Numpy isn't configured to use a fast BLAS library.")

def tdot(mat, out=None):
    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
    if (mat.dtype != 'float64') or (len(mat.shape) != 2):
        return np.dot(mat, mat.T)
    nn = mat.shape[0]
    if not out:
        out = np.zeros((nn,nn))
    else:
        assert(out.dtype == 'float64')
        assert(out.shape == (nn,nn))
        # FIXME: should allow non-contiguous out, and copy output into it:
        assert(8 in out.strides)
        # zeroing needed because of dumb way I copy across triangular answer
        out[:] = 0.0

    ## Call to DSYRK from BLAS
    # If already in Fortran order (rare), and has the right sorts of strides I
    # could avoid the copy. I also thought swapping to cblas API would allow use
    # of C order. However, I tried that and had errors with large matrices:
    # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
    mat = mat.copy(order='F')
    TRANS = c_char('n')
    N = c_int(mat.shape[0])
    K = c_int(mat.shape[1])
    LDA = c_int(mat.shape[0])
    UPLO = c_char('l')
    ALPHA = c_double(1.0)
    A = mat.ctypes.data_as(ctypes.c_void_p)
    BETA = c_double(0.0)
    C = out.ctypes.data_as(ctypes.c_void_p)
    LDC = c_int(np.max(out.strides) / 8)
    _dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
            byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))

    # Copy triangular answer across (there will be a better way):
    # Wouldn't need to zero over provided out array if did this better.
    out += out.T
    out[[range(nn),range(nn)]] /= 2.0  # correct the diagonal

    return out

#def cov(m, y=None, rowvar=1, bias=0, ddof=None):
#    """drop-in replacement for scipy.cov AKA numpy.lib.function_base.cov"""
#    TODO -- could make a fast cov, using tdot


if __name__ == '__main__':
    from numpy.random import rand
    import timeit
    
    a = rand(4000, 4000)
    repeats = 1
    print(timeit.Timer("numpy.dot(a, a.T)",
            "import numpy; from __main__ import a").timeit(repeats))
    print(timeit.Timer("tdot(a)",
            "from __main__ import a, tdot").timeit(repeats))
    ans1 = np.dot(a,a.T)
    ans2 = tdot(a)
    err = np.max(np.abs(ans1 - ans2) / ans1)
    print('max error = %g' % err)