Figure 4.12:

The sum of squares validation error for PCR and PLSR versus the number of principal components/latent variables \ell ; note that only two latent variables are required versus four principal components.

Code for Figure 4.12

Text of the GNU GPL.

main.m


  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
n = 100;
p = 2;
q = 5;

% reproducible data!
rand("seed",-1);

% create the data set
% true parameters
B = [zeros(3,2); [-1 1]; [0 0]];

X = rand(n, q-1);
% add a nearly dependent final column
xq = X*ones(q-1,1)/sqrt(q-1) + 0.01*rand(n,1);

% fitting data
X = [X, xq];
Y = X*B + 0.1*rand(n,p);

% cross validation data
X2 = rand(n, q);
Y2 = X2*B + 0.1*rand(n,p);

Xdat = [X;X2]; Ydat = [Y;Y2];

% write out low precision data for posting on website
myfile = fopen('pca_pls_data.dat', 'w');
fprintf(myfile, '%     x1      x2      x3     x4      x5      y1      y2')
fprintf(myfile, '\n')
for i = 1: 2*n
  fprintf(myfile, '%8.3f', Xdat(i,:), Ydat(i,:))
  fprintf(myfile, '\n')
end
fclose(myfile);

% reload low precision data so we estimate with the same data

Data = load ('pca_pls_data.dat');
X = Data(1:n,1:q); Y = Data(1:n,q+1:q+p);
X2 = Data(n+1:end,1:q); Y2 = Data(n+1:end,q+1:q+p);


% full least squares
Bls = inv(X'*X)*X'*Y;

% PCR or SVD
[U,S,V] = svd(X);
% check all numbers of principal components (singular values)
Bsvd(1:q) = {zeros(q,p)};
Ysvd(1:q) = {zeros(n,p)};
for l = 1:q
  Ul=U(:,1:l); Sl=S(1:l,1:l); Vl=V(:,1:l);
  Bsvd(l) = Vl*inv(Sl)*Ul'*Y;
% compare prediction on cross-validation data
  Ysvd(l) = {X*cell2mat(Bsvd(l))};
  errsvd(l) = norm(Y-cell2mat(Ysvd(l)));
end

%PLS algorithm
U = []; T=[]; P=[]; Q=[];
E = X; F = Y;
Bpls(1:q) = {zeros(q,p)};
Ypls(1:q) = {zeros(n,p)};
for i = 1:q
  [UU,SS,VV] = svd(E'*F);
  u = UU(:,1); v=VV(:,1);
  t = E*u; w = F*v;
  t = t/sqrt(t'*t);
  pp = E'*t; qq = F'*t;
  E = E - t*pp'; F = F - t*qq';
  U = [U u]; T = [T t]; P = [P pp]; Q=[Q qq];
  R = U / (P'*U); % i.e., R = inv(P'U)*U
  %Bpls = R*inv(T'*T)*T'*Y
  Bpls(i) = {R*Q'};
  Ypls(i) = {X*cell2mat(Bpls(i))};
  errpls(i) = norm(Y-cell2mat(Ypls(i)));
end

figure()
plot(1:q, errsvd, '-o', 1:q, errpls, '-+')

%cross validation
for i = 1:q
  Y2svd(i) = {X2*cell2mat(Bsvd(i))};
  err2svd(i) = norm(Y2-cell2mat(Y2svd(i)));
  Y2pls(i) = {X2*cell2mat(Bpls(i))};
  err2pls(i) = norm(Y2-cell2mat(Y2pls(i)));
end

figure()
plot(1:q, err2svd, '-o', 1:q, err2pls, '-+')

for i = 1:q
  figure()
  plot(Y2(:,1), cell2mat(Y2pls(i))(:,1), 'o')
end
for i = 1:q
  figure()
  plot(Y2(:,1), cell2mat(Y2svd(i))(:,1), 'o')
end

% save data for book figure()

tab1 = [(1:q)', errsvd'];
tab2 = [(1:q)', errpls'];
tab3 = [Y, cell2mat(Ysvd)];
tab4 = [Y, cell2mat(Ypls)];
tab5 = [(1:q)', err2svd'];
tab6 = [(1:q)', err2pls'];
tab7 = [Y2, cell2mat(Y2svd)];
tab8 = [Y2, cell2mat(Y2pls)];

save pca_pls.dat tab1 tab2 tab3 tab4 tab5 tab6 tab7 tab8