Commit cc6fe514 by Hafiz Imtiaz

initial commit with function and readme

# Author
This project is primarily maintained by Hafiz Imtiaz, a graduate student in the Department of Electrical and Computer Engineering at Rutgers, The State University of New Jersey.
## Primary author
* [Hafiz Imtiaz](https://gitlab.com/u/hafizimtiaz)
## Additional contributors
* [Anand D. Sarwate](https://gitlab.com/u/asarwate)
# CCA
CCA is a MATLAB implementation of the canonical correlation analysis algorithm, in both non-private and differentially private variants. It currently contains a function that computes the CCA, PCA, DP-CCA, and DP-PCA subspaces along with several performance indices. The details of the performance indices are described inside the function.
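A minimal usage sketch (the synthetic two-view data below is purely illustrative; only the `myCCA` signature comes from the function file):
```
dx = 20; dy = 30; N = 500; K = 5;       % hypothetical view dimensions, sample size, target dimension
x = randn(dx + dy, N);                  % stacked views: rows 1:dx are view 1, the remaining rows are view 2
[CCA, PCA, DPCCA, DPPCA] = myCCA(x, dx, dy, K, 1, 0.01);  % epsilon = 1, delta = 0.01
```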
## Contact
* Hafiz Imtiaz (hafiz.imtiaz@rutgers.edu)
## Dependencies
The code has been tested on MATLAB R2016a. The helper functions are included in the function files.
## Downloading
You can download the repository from https://gitlab.com/hafizimtiaz/dp-cca.git, or clone it using
```
$ git clone https://gitlab.com/hafizimtiaz/dp-cca.git
```
## Installation
Copy the function into your working directory, or add the directory containing the functions to the MATLAB path.
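For example, assuming the repository was cloned into a local folder (the path below is a placeholder):
```
addpath('path/to/dp-cca');   % make myCCA visible on the MATLAB path
savepath;                    % optional: keep it on the path in future sessions
```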
## License
MIT license
## Acknowledgements
The development of this collection was supported by the following sources:
* National Institutes of Health under award 1R01DA040487-01A1
Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Institutes of Health.
function [CCA, PCA, DPCCA, DPPCA] = myCCA(x, dx, dy, K, epsilon, delta)
% code for computing CCA, PCA, DP-CCA and DP-PCA subspaces along with
% several performance indices
% Inputs:
% x - d-by-N sample matrix: N samples in d dimensions. The two views are
%     dx- and dy-dimensional, with d = dx + dy.
% dx, dy - dimensions of the two views
% K - target (reduced) dimension
% epsilon, delta - privacy parameters. Defaults are \epsilon = 1 and \delta = 0.01
% Outputs:
% CCA/PCA/DPCCA/DPPCA - structures containing the respective subspaces:
%     U1 (dx-by-K) and U2 (dy-by-K) for CCA/DPCCA, U (d-by-K) for PCA/DPPCA,
%     along with the following performance scores:
%     1. CHindex - the Calinski-Harabasz index
%     2. CCA scores (CCA and DPCCA only) -
%        score1 = norm(X_new - Y_new,'fro') => the term we want to minimize from the Raman Arora paper
%        score2 = trace((1/N)*(X_new*Y_new')) => the total canonical correlation
%        X_new = samples from View1 after projection onto the proper CCA subspace
%        Y_new = samples from View2 after projection onto the proper CCA subspace
%     3. STindex - the Silhouette index (-1 to +1, higher means better clustering)
%     The clustering scores are based on the projection of view 1.
if nargin < 5
epsilon = 1;
end
if nargin < 6
delta = 0.01;
end
[~,N] = size(x);
r_x = 0.05; % to avoid ill-conditioned matrices
r_y = 0.05;
% computing non-private CCA
C = (1/N) * (x*x');
C11 = C(1:dx,1:dx) + r_x * eye(dx);
C12 = C(1:dx,dx+1:end);
C21 = C(dx+1:end,1:dx);
C22 = C(dx+1:end,dx+1:end) + r_y * eye(dy);
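% the per-view CCA subspaces are taken as the leading singular vectors of
% (C11\C12)*(C22\C21) and (C22\C21)*(C11\C12), respectively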
[U1,~,~] = svd((C11\C12)*(C22\C21));
[U2,~,~] = svd((C22\C21)*(C11\C12));
CCA.U1 = U1(:,1:K);
CCA.U2 = U2(:,1:K);
x1_new = CCA.U1' * x(1:dx,:);
x2_new = CCA.U2' * x(dx+1:end,:);
[idxCCA,C_CCA] = kmeans(x1_new,2);
CCA.CHindex = myCHindex(x1_new',C_CCA',idxCCA');
sCCA = silhouette(x1_new',idxCCA');
CCA.STindex = mySTindex(sCCA,idxCCA',2); % 2 = number of clusters used in kmeans above
[CCA.score(1), CCA.score(2)] = myCCAscore(x1_new,x2_new);
% computing non-private PCA
[U,~,~] = svd(C);
PCA.U = U(:,1:K);
x_new = PCA.U' * x;
[idxPCA,C_PCA] = kmeans(x_new,2);
PCA.CHindex = myCHindex(x_new',C_PCA',idxPCA);
sPCA = silhouette(x_new',idxPCA');
PCA.STindex = mySTindex(sPCA,idxPCA',2); % 2 = number of clusters used in kmeans above
% computing DP-CCA using Gaussian Mechanism (C Dwork 2014)
tau = (1/(epsilon*N)) * sqrt(2*log(1.25/delta));
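% tau above is the noise standard deviation of the Gaussian mechanism
% (Dwork et al. 2014) for a target with L2 sensitivity 1/N, which presumes
% samples of bounded (unit) norm; the noise matrix E is made symmetric below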
temp = normrnd(0,tau,dx+dy,dx+dy); % i.i.d. Gaussian noise entries
temp2 = triu(temp); % keep the upper triangle (including the diagonal)
temp3 = temp2';
temp4 = tril(temp3,-1); % mirror the upper triangle below the diagonal
E = temp2+temp4; % symmetric noise matrix
C_hat = C + E;
C11 = C_hat(1:dx,1:dx);
C12 = C_hat(1:dx,dx+1:end);
C21 = C_hat(dx+1:end,1:dx);
C22 = C_hat(dx+1:end,dx+1:end);
[U1,~,~] = svd((C11\C12)*(C22\C21));
[U2,~,~] = svd((C22\C21)*(C11\C12));
DPCCA.U1 = U1(:,1:K);
DPCCA.U2 = U2(:,1:K);
x1_new = DPCCA.U1'*x(1:dx,:);
x2_new = DPCCA.U2'*x(dx+1:end,:);
[idxDPCCAAG,C_DPCCAAG] = kmeans(x1_new,2);
DPCCA.CHindex = myCHindex(x1_new',C_DPCCAAG',idxDPCCAAG);
sDPCCA = silhouette(x1_new',idxDPCCAAG');
DPCCA.STindex = mySTindex(sDPCCA,idxDPCCAAG',2); % 2 = number of clusters used in kmeans above
[DPCCA.score(1), DPCCA.score(2)] = myCCAscore(x1_new,x2_new);
% computing DP-PCA using Gaussian Mechanism (C Dwork 2014)
[U,~,~] = svd(C_hat);
DPPCA.U = U(:,1:K);
x_new = DPPCA.U'*x;
[idxDPPCA,C_DPPCA] = kmeans(x_new,2);
DPPCA.CHindex = myCHindex(x_new',C_DPPCA',idxDPPCA);
sDPPCA = silhouette(x_new',idxDPPCA');
DPPCA.STindex = mySTindex(sDPPCA,idxDPPCA',2); % 2 = number of clusters used in kmeans above
%% helper functions
function index = mySTindex(S,idx,K)
% index - the Silhouette index (varies from -1 to +1; higher means better clustering)
% S - Silhouette values of the samples (n-by-1 vector, n - number of samples)
% idx - cluster labels of all the samples (n-by-1 vector)
% K - number of clusters
avg_sil = zeros(K,1);
for k = 1:K
idsk = find(idx==k);
avg_sil(k) = sum(S(idsk))/length(idsk);
end
index = sum(avg_sil)/K;
return
function index = myCHindex(X,C,idx)
% index - the Calinski-Harabasz index
% X - data samples, n-by-d matrix, d feature dimension, n number of samples
% C - cluster centers, K-by-d matrix, K is the cluster number
% idx - cluster labels of all the samples, n-by-1 vector
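% the index is (trB/(K-1)) / (trW/(n-K)), where trB sums the squared distances
% of the cluster centers to their mean, weighted by cluster size, and trW sums
% the squared distances of the samples to their own cluster centers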
[K,~] = size(C);
n = length(idx);
z = mean(C);
trB = 0;
trW = 0;
for k = 1:K
idsk = find(idx==k);
nk = length(idsk);
zk = C(k,:);
trB = trB + nk*norm(zk - z)^2;
temp = X(idsk,:) - repmat(zk,nk,1);
for i = 1:nk
trW = trW + norm(temp(i,:))^2; % squared distance to the cluster center
end
end
index = (trB/(K-1))/(trW/(n-K));
return
function [score1, score2] = myCCAscore(X_new,Y_new)
% this function computes two scores to measure the performance of the CCA
% score1 = norm(X_new - Y_new,'fro') => the term we want to minimize from Raman Arora paper
% score2 = trace((1/N)*(X_new*Y_new')) => the total canonical correlation
% X_new = samples from View1 after projection onto the proper CCA subspace
% Y_new = samples from View2 after projection onto the proper CCA subspace
[~,n] = size(X_new);
score2 = trace((1/n)*(X_new*Y_new'));
score1 = (1/n)*norm(X_new - Y_new,'fro');
return
% this kmeans function is taken from the tensorlab toolbox
% Authors: Laurent Sorber (Laurent.Sorber@cs.kuleuven.be)
% Nico Vervliet (Nico.Vervliet@esat.kuleuven.be)
% Marc Van Barel (Marc.VanBarel@cs.kuleuven.be)
% Lieven De Lathauwer (Lieven.DeLathauwer@kuleuven-kulak.be)
%
% References:
% [1] J. B. MacQueen, "Some Methods for Classification and Analysis of
% MultiVariate Observations," in Proc. of the fifth Berkeley
% Symposium on Mathematical Statistics and Probability, L. M. L. Cam
% and J. Neyman, eds., Vol. 1, UC Press, 1967, pp. 281-297.
% [2] D. Arthur and S. Vassilvitskii, "k-means++: The Advantages of
% Careful Seeding," Technical Report 2006-13, Stanford InfoLab, 2006
function [L,C] = kmeans(X,k,dist)
%KMEANS Cluster multivariate data using the k-means++ algorithm.
% [L,C] = kmeans(X,k) produces a 1-by-size(X,2) vector L with one class
% label per column in X and a size(X,1)-by-k matrix C containing the
% centers corresponding to each class.
%
% kmeans(X,k,'norm') and kmeans(X,k,'angle') use the Euclidean distance
% and angle, respectively, as metrics for the intra-point distance.
%
if nargin < 3, dist = 'norm'; end
if strcmpi(dist,'angle'), X = bsxfun(@rdivide,X,sqrt(dot(X,X,1))); end
L = [];
L1 = 0;
c = 0; cmax = 10;
while length(unique(L)) ~= k && c<cmax
% The k-means++ initialization.
C = X(:,randi(size(X,2),1));
L = ones(1,size(X,2));
for i = 2:k
D = X-C(:,L);
D = sqrt(dot(D,D,1));
C(:,i) = X(:,find(rand < cumsum(D)/sum(D),1));
switch dist
case 'norm'
[~,L] = max(bsxfun(@minus,2*real(C'*X),dot(C,C,1).'),[],1);
case 'angle'
[~,L] = max(real(C'*X),[],1);
end
end
% The k-means algorithm.
d = 0; dmax = 10;
while any(L ~= L1) && d<dmax
L1 = L;
for i = 1:k, l = L==i; C(:,i) = sum(X(:,l),2)/sum(l); end
switch dist
case 'norm'
[~,L] = max(bsxfun(@minus,2*real(C'*X),dot(C,C,1).'),[],1);
case 'angle'
C = bsxfun(@rdivide,C,sqrt(dot(C,C,1)));
[~,L] = max(real(C'*X),[],1);
end
d = d+1;
end
c = c+1;
end
return