% Exploring how to take gradients wrt a log matrix quantity

% Just makes debugging easier:
rand('state',0);
randn('state',0);

D=3;
epsilon=wishrnd(eye(D),10)*1e-5;

X=wishrnd(eye(D),D*3);
a=randn(D,1);
b=randn(D,1);

% Y=a'*X*b;
% dY/dX = b*a'

% First check numerically that I've done normal gradient correctly.
empiricaldY=a'*(X+epsilon)*b-a'*X*b
grad=b*a';
calculusdY=sum(sum(grad.*epsilon))
theerror=empiricaldY-calculusdY

lX=logm(X);

if 0
	% move on to perturbing the log quantity.
	%epsilon=tril(epsilon);
	l_empiricaldY=a'*(expm(lX+epsilon))*b-a'*expm(lX)*b
	% The following are NOT how to do it:
	%l_calculusdY=sum(sum((X*b*a').*epsilon))
	%l_calculusdY=sum(sum((b*a'*X).*epsilon))

	% Note: dX/dlogX or de^Z/dZ is a 4-dimensional quanity. Doing this properly
	% starts at the sumation involving that.

	% If X were an unconstrained matrix AND X and its gradient commute then the
	% gradient would be:
	grad = grad'*X;
	% But we know X is symmetric, so...
	% See the section on the symmetry operator in
	% Najfeld I, Havel TF:
	% Derivatives of the Matrix Exponential and Their Computation
	% Advances in Applied Mathematics 16, 321-375 (1995).
	grad=grad+grad'-diag(diag(grad));

	% And we also know that the gradient and X do NOT commute so we won't get quite
	% the right answer.
	l_calculusdY=sum(sum(grad.*epsilon))
	l_theerror=l_empiricaldY-l_calculusdY

	% It seems that in general the commutator between the gradient and X could be
	% arbitrary and things might be pretty hard work.
end

% Try (again) to do it properly:
%a=b % TODO: when all works will only have transpose errors left
l_empiricaldY=a'*(expm(lX+epsilon))*b-a'*expm(lX)*b
A=b*a';
[E,L]=eig(lX);
L=diag(L);
[a,b]=meshgrid(L,L);
ix=find(~eye(D));
T1=zeros(D);
T1(ix)= a(ix)./(b(ix).*(b(ix)-a(ix))).*(exp(b(ix))-1-b(ix)-0.5*b(ix).^2)...
       +b(ix)./(a(ix).*(a(ix)-b(ix))).*(exp(a(ix))-1-a(ix)-0.5*a(ix).^2);
ix=find(eye(D));
T1(ix)=(exp(L)-1-L)-2*(exp(L)-1-L-L.^2/2)./L;

G1=E*((E'*(A'*E)).*T1)*E';
%G1=0;
%for i=3:50
%	for j=1:(i-2)
%		G1=G1+lX^j*A'*lX^(i-1-j)/factorial(i);
%	end
%end
%G1

T2=(exp(L)-1-L)./L; % used for both G2 and G3

G2=E*((E'*A').*repmat(T2,1,D));
%G2=0;
%for i=2:50
%	G2=G2+lX^(i-1)*A'/factorial(i);
%end
%G2

G3=(A'*E)*(E.*repmat(T2',D,1))';
%G3=0;
%for i=2:100
%	G3=G3+A'*lX^(i-1)/factorial(i);
%end
%G3

grad=G1'+G2'+G3'+A';
%grad=0;
%for i=1:200
%	for j=0:i
%		grad=grad+lX^j*A'*lX^(i-j)/factorial(i+1);
%	end
%end
%grad=grad'+A'

grad2=grad+grad'-diag(diag(grad)); % grad is not the correct gradient if constrained to be symmetric
l_calculusdY=sum(sum(grad.*epsilon))
l_calculusdY2=sum(sum(grad2.*tril(epsilon))) % but interpret grad2 correctly if use it
l_theerror=l_empiricaldY-l_calculusdY
l_theerror=l_empiricaldY-l_calculusdY2