% Exploring how to take gradients wrt a log matrix quantity % Just makes debugging easier: rand('state',0); randn('state',0); D=3; epsilon=wishrnd(eye(D),10)*1e-5; X=wishrnd(eye(D),D*3); a=randn(D,1); b=randn(D,1); % Y=a'*X*b; % dY/dX = b*a' % First check numerically that I've done normal gradient correctly. empiricaldY=a'*(X+epsilon)*b-a'*X*b grad=b*a'; calculusdY=sum(sum(grad.*epsilon)) theerror=empiricaldY-calculusdY lX=logm(X); if 0 % move on to perturbing the log quantity. %epsilon=tril(epsilon); l_empiricaldY=a'*(expm(lX+epsilon))*b-a'*expm(lX)*b % The following are NOT how to do it: %l_calculusdY=sum(sum((X*b*a').*epsilon)) %l_calculusdY=sum(sum((b*a'*X).*epsilon)) % Note: dX/dlogX or de^Z/dZ is a 4-dimensional quanity. Doing this properly % starts at the sumation involving that. % If X were an unconstrained matrix AND X and its gradient commute then the % gradient would be: grad = grad'*X; % But we know X is symmetric, so... % See the section on the symmetry operator in % Najfeld I, Havel TF: % Derivatives of the Matrix Exponential and Their Computation % Advances in Applied Mathematics 16, 321-375 (1995). grad=grad+grad'-diag(diag(grad)); % And we also know that the gradient and X do NOT commute so we won't get quite % the right answer. l_calculusdY=sum(sum(grad.*epsilon)) l_theerror=l_empiricaldY-l_calculusdY % It seems that in general the commutator between the gradient and X could be % arbitrary and things might be pretty hard work. end % Try (again) to do it properly: %a=b % TODO: when all works will only have transpose errors left l_empiricaldY=a'*(expm(lX+epsilon))*b-a'*expm(lX)*b A=b*a'; [E,L]=eig(lX); L=diag(L); [a,b]=meshgrid(L,L); ix=find(~eye(D)); T1=zeros(D); T1(ix)= a(ix)./(b(ix).*(b(ix)-a(ix))).*(exp(b(ix))-1-b(ix)-0.5*b(ix).^2)... +b(ix)./(a(ix).*(a(ix)-b(ix))).*(exp(a(ix))-1-a(ix)-0.5*a(ix).^2); ix=find(eye(D)); T1(ix)=(exp(L)-1-L)-2*(exp(L)-1-L-L.^2/2)./L; G1=E*((E'*(A'*E)).*T1)*E'; %G1=0; %for i=3:50 % for j=1:(i-2) % G1=G1+lX^j*A'*lX^(i-1-j)/factorial(i); % end %end %G1 T2=(exp(L)-1-L)./L; % used for both G2 and G3 G2=E*((E'*A').*repmat(T2,1,D)); %G2=0; %for i=2:50 % G2=G2+lX^(i-1)*A'/factorial(i); %end %G2 G3=(A'*E)*(E.*repmat(T2',D,1))'; %G3=0; %for i=2:100 % G3=G3+A'*lX^(i-1)/factorial(i); %end %G3 grad=G1'+G2'+G3'+A'; %grad=0; %for i=1:200 % for j=0:i % grad=grad+lX^j*A'*lX^(i-j)/factorial(i+1); % end %end %grad=grad'+A' grad2=grad+grad'-diag(diag(grad)); % grad is not the correct gradient if constrained to be symmetric l_calculusdY=sum(sum(grad.*epsilon)) l_calculusdY2=sum(sum(grad2.*tril(epsilon))) % but interpret grad2 correctly if use it l_theerror=l_empiricaldY-l_calculusdY l_theerror=l_empiricaldY-l_calculusdY2