===== ams.m =====

function [sig,C,alpha,b] = ams(X,Y,method,sphere,opt)
% AMS  Automatic Model Selection for kernel methods with
%        - RBF Gaussian kernel
%        - L2 penalization of the errors
%      For classification: Support Vector Machines
%      For regression: Kernel Ridge Regression
%
% Usage: [sig,C,alpha,b] = AMS(X,Y,method,[sphere],[opt])
%
% X: an n times d matrix containing n training points in d dimensions
% Y: an n dimensional vector containing either the class labels
%    (-1 or 1) in classification or the target values in regression.
% method is one of:
%   'loo': minimizes the leave-one-out error. The LOO is an accurate
%          estimate of the generalization error but can be difficult
%          to minimize (not smooth). Also, do not use it with sphere = 0
%          and large d (the gradient computations take a long time).
%   'rm':  radius/margin bound. Easier to minimize; a good first choice.
%   'val': minimizes a validation error. Good if there are a lot
%          of training points. The validation points are the last
%          nv points of the training set [nv is specified in the options].
%   'ev':  evidence maximization.
% sphere: if 1, spherical RBF kernel = exp(-0.5*||x-y||^2 / sigma^2)
%         if 0, one sigma per component:
%               exp(-0.5*sum (x_i-y_i)^2 / sigma_i^2)
%         [default 1]
% opt: structure containing the options (default values in brackets)
%   itermax: maximum number of conjugate gradient steps [100]
%   tolX:    gradient norm stopping criterion [1e-5]
%   hregul:  regularization on the hyperparameters (to avoid extreme values) [0.01]
%   nonorm:  if 1, do not normalize the inputs [0]
%   verb:    verbosity [0]
%   eta:     smoothing parameter used in classification by 'ev' and 'loo';
%            larger is smoother [0.1]
%   sigmoid: slope of the sigmoid used by 'val' and 'loo' in classification [5]
%   nv:      number of validation points for 'val' [n/2]
%
% sig:     the sigma(s) found by the model selection; a scalar if
%          sphere = 1, a d dimensional vector otherwise
% C:       the constant penalizing the training errors
% alpha,b: the parameters of the prediction function
%          f(x) = sum alpha_i K(x,x_i) + b

% Copyright Olivier Chapelle
% olivier.chapelle@tuebingen.mpg.de
% last modified 21.04.2005

% Initialization
[n,d] = size(X);
if (size(Y,1)~=n) | (size(Y,2)~=1)
  error('Dimension error');
end;

if all(abs(Y)==1)
  classif = 1;   % We are in classification
else
  classif = 0;   % We are in regression
end;

if nargin < 5    % Assign the options to their default values
  opt = [];
end;
if ~isfield(opt,'itermax'), opt.itermax = 100;         end;
if ~isfield(opt,'length'),  opt.length  = opt.itermax; end;
if ~isfield(opt,'tolX'),    opt.tolX    = 1e-5;        end;
if ~isfield(opt,'hregul'),  opt.hregul  = 1e-2;        end;
if ~isfield(opt,'nonorm'),  opt.nonorm  = 0;           end;
if ~isfield(opt,'verb'),    opt.verb    = 0;           end;
if ~isfield(opt,'eta'),     opt.eta     = 0.1;         end;
if ~isfield(opt,'sigmoid'), opt.sigmoid = 5;           end;
if ~isfield(opt,'nv'),      opt.nv      = round(n/2);  end;

if nargin < 4
  sphere = 1;
end;

% Normalization
if ~opt.nonorm & (norm(std(X)-1) > 1e-5*d)
  X = X./repmat(std(X),size(X,1),1);
  fprintf(['The data have been normalized (each component is divided by ' ...
           'its standard deviation).\n Don''t forget to do the same for the ' ...
           'test set.\n']);
end;

% Default values of the hyperparameters (in log scale)
C = 0;
sig = .3*var(X)';
if ~sphere, sig = log(d*sig) / 2; else sig = log(sum(sig)) / 2; end;

% Do the optimization on the hyperparameters
param = [sig; C];
default_param = param;
% check_all_grad(param,X,Y,classif,default_param,opt); return;
% plot_criterion(X,Y,param,[-3:0.1:3],classif,default_param,opt);
% return;
[param,fX] = minimize(param,@obj_fun,opt,X,Y,method,classif,default_param,opt);

% Prepare the outputs
[alpha, b] = learn(X,Y,param,classif);
C = exp(param(end));
sig = exp(param(1:end-1));


function plot_criterion(X,Y,param,range,classif,default_param,opt)
for i=1:length(range)
  fprintf('%f\r',range(i));
  for j=1:2
    param2 = param;
    param2(end-j+1) = param2(end-j+1)+range(i);
    obj(1,i,j) = obj_fun(param2,X,Y,'loo',classif,default_param,opt);
    obj(2,i,j) = obj_fun(param2,X,Y,'rm', classif,default_param,opt);
    obj(3,i,j) = obj_fun(param2,X,Y,'val',classif,default_param,opt);
    obj(4,i,j) = exp(obj_fun(param2,X,Y,'ev', classif,default_param,opt));
  end;
end;
figure; plot(obj(:,:,1)');
figure; plot(obj(:,:,2)');


function check_all_grad(param,X,Y,classif,default_param,opt)
param = randn(size(param));
checkgrad(@obj_fun,param,1e-6,X,Y,'loo',classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'rm', classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'val',classif,default_param,opt)
checkgrad(@obj_fun,param,1e-6,X,Y,'ev', classif,default_param,opt)


function [obj, grad] = obj_fun(param,X,Y,meth,classif,default_param,opt)
% Compute the model selection criterion and its derivatives with
% respect to param, a vector containing, in log scale, sigma and C.

% Add a regularization on the hyperparameters so that they do not
% take extreme values
obj0  = opt.hregul * mean((param-default_param).^2);
grad0 = 2*opt.hregul * (param-default_param) / length(param);
if obj0 > 1      % Extreme values of the hyperparameters: the rest
  obj  = obj0;   % might not even work because of numerical
  grad = grad0;  % instabilities -> exit
  return;
end

if strcmp(meth,'val')  % Remove the validation set for the learning
  [alpha,b] = learn(X(1:end-opt.nv,:),Y(1:end-opt.nv),param,classif);
else
  [alpha,b] = learn(X,Y,param,classif);
end;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Compute the objective function %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
sv = find(alpha~=0);   % Find the set of support vectors

switch lower(meth)

 %
 % Leave-one-out
 %
 case 'loo'
  K = compute_kernel(X(sv,:),[],param);
  K(end+1,:) = 1;   % To take the bias into account
  K(:,end+1) = 1;
  K(end,end) = 0;
  D = opt.eta./abs(alpha(sv));
  invKD = inv(K+diag([D;0]));
  invKD(:,end) = [];
  invKD(end,:) = [];
  span = 1./diag(invKD) - D;   % "Span" of the support vectors:
  % estimated change of the output of each point when it is
  % removed from the training set.
  change = alpha(sv).*span;
  if classif
    out = 1-Y(sv).*change;
    obj = sum(tanh(-opt.sigmoid*out)+1)/2/length(Y);
  else
    obj = mean(change.^2);
  end;
  invK = inv(K);
  invK(:,end) = [];
  invK(end,:) = [];

 %
 % Radius margin
 %
 case 'rm'
  K = compute_kernel(X,[],param);
  w2 = sum(alpha.*Y);
  if classif
    R2 = mean(diag(K)) - mean(mean(K));   % Radius estimated by variance
  else
    K(end+1,:) = 1;                       % Radius = mean span
    K(:,end+1) = 1;
    K(end,end) = 0;
    invK = inv(K);
    span = 1./diag(invK);
    R2 = mean(span(1:end-1));
  end;
  obj = R2*w2 / length(Y);

 %
 % Validation set
 %
 case 'val'
  K = compute_kernel(X(1:end-opt.nv,:),X(end-opt.nv+1:end,:),param);
  Yv = Y(end-opt.nv+1:end);
  out = K(end-opt.nv+1:end,:)*alpha+b;   % Output on the validation points
  if classif
    % Goes through a sigmoid because the step function is not differentiable
    obj = mean(tanh(-opt.sigmoid*out.*Yv)+1)/2;
  else
    obj = mean((out-Yv).^2);
  end;
  Kl = K(sv,sv);
  Kl(end+1,:) = 1;
  Kl(:,end+1) = 1;
  Kl(end,end) = 0;
  invK = inv(Kl);

 %
 % Evidence maximization
 %
 case 'ev'
  K = compute_kernel(X,[],param);
  if classif
    % Normally, K should be computed only on the support vectors. But
    % then, the evidence wouldn't be continuous. We thus make a soft
    % transition for small alphas.
    weight = min(0, (alpha.*Y)/(mean(alpha.*Y))/opt.eta - 1);
    weight2 = 1 - weight.^2;
    K0 = K;
    K = K.*(max(weight2*weight2',eye(length(weight2))));
    Kl = K0(sv,sv);
    Kl(end+1,:) = 1;
    Kl(:,end+1) = 1;
    Kl(end,end) = 0;
    invKl = inv(Kl);
    invKl(:,end) = [];
    invKl(end,:) = [];
  end;
  [invK, logdetK] = inv_logdet_pd_(K);
  obj = logdetK/length(K) + log(mean(alpha.*Y));

 otherwise
  error('Unknown model selection criterion');
end;

%%%%%%%%%%%%%%%%%%%%%%%%%
% Compute the gradients %
%%%%%%%%%%%%%%%%%%%%%%%%%
for i=1:length(param)
  switch lower(meth)

   %
   % Leave-one-out
   %
   case 'loo'
    dK = compute_kernel(X(sv,:),[],param,i);
    dalpha = -invK * (dK*alpha(sv));
    dD = - opt.eta * dalpha.*sign(alpha(sv))./alpha(sv).^2;
    dspan = diag(invKD).^(-2).* ...
            diag(invKD*(dK+diag(dD))*invKD) - dD;
    dchange = dalpha.*span + alpha(sv).*dspan;
    if classif
      dout = -Y(sv).*dchange;
      grad(i) = -0.5*opt.sigmoid * ...
                sum(cosh(-opt.sigmoid*out).^(-2).*dout)/length(Y);
    else
      grad(i) = 2*mean(change.*dchange);
    end;

   %
   % Radius margin
   %
   case 'rm'
    dK = compute_kernel(X,[],param,i);
    dw2 = -alpha'*dK*alpha;
    if classif
      dR2 = mean(diag(dK)) - mean(mean(dK));
    else
      dspan = diag(invK).^(-2).*diag(invK(:,1:end-1)*dK*invK(1:end-1,:));
      dR2 = mean(dspan(1:end-1));
    end;
    grad(i) = (dR2*w2 + R2*dw2) / length(Y);

   %
   % Validation set
   %
   case 'val'
    dK = compute_kernel(X(1:end-opt.nv,:),X(end-opt.nv+1:end,:),param,i);
    dalpha = -invK(:,1:end-1) * (dK(sv,:)*alpha);
    db = dalpha(end);
    dalpha = dalpha(1:end-1);
    dout = K(end-opt.nv+1:end,sv)*dalpha + db + ...
           dK(end-opt.nv+1:end,:) * alpha;
    if classif
      grad(i) = -0.5*opt.sigmoid * ...
                mean(cosh(-opt.sigmoid*out.*Yv).^(-2).*dout.*Yv);
    else
      grad(i) = 2*mean((out-Yv).*dout);
    end;

   %
   % Evidence maximization
   %
   case 'ev'
    dK = compute_kernel(X,[],param,i);
    dK0 = dK;
    if classif
      dalpha = zeros(size(alpha));
      dalpha(sv) = -invKl * (dK(sv,:)*alpha);
      dweight = -2/opt.eta * weight.* ...
                (dalpha.*Y/mean(alpha.*Y) - (alpha.*Y) * mean(dalpha.*Y)/mean(alpha.*Y)^2);
      dK = dK .* max(weight2*weight2',eye(length(weight))) + ...
           (K0-diag(diag(K0))) .* (weight2*dweight' + dweight*weight2');
    end;
    grad(i) = invK(:)'*dK(:)/length(K) - ...
              alpha'*dK0*alpha / sum(alpha.*Y);
  end;
end;

obj  = obj0  + obj;
grad = grad0 + grad';


function [alpha,b] = learn(X,Y,param,classif)
% Do the actual learning (SVM or kernel ridge regression)

K = compute_kernel(X,[],param);
svset = 1:length(Y);
old_svset = [];
iter = 0; itermax = 20;

% If the set of support vectors has changed, we need to
% reiterate. Note that for regression, all points are support
% vectors and there is only one iteration.
while ~isempty(setxor(svset,old_svset)) & (iter < itermax)
  % [The body of this loop, the end of learn, and the beginning of the
  %  next subfunction -- which estimates the leave-one-out error with the
  %  span rule -- are missing from this listing; the surviving fragment
  %  of that subfunction follows.]

sv1 = find(alpha > max(alpha)*eps & alpha < C*(1-eps));
sv2 = find(alpha > C*(1-eps));

% Degenerate case: if sv1 is empty, then we assume nothing changes
% (loo = training error)
if isempty(sv1)
  loo = mean(output < 0);
  return;
end;

% Compute the inverse of KSV
l = length(sv1);
KSV = [[K(sv1,sv1) ones(l,1)]; [ones(1,l) 0]];
% a small ridge is added to make sure that the matrix is invertible
invKSV = inv(KSV+diag(1e-12*[ones(1,l) 0]));

% Compute the span for all support vectors.
n = length(K);       % Number of training points
span = zeros(n,1);   % Initialize the vector
tmp = diag(invKSV);
span(sv1) = 1./tmp(1:l);   % Span of the sv of the first category

% If there are sv of the second category, compute their span
if ~isempty(sv2)
  V = [K(sv1,sv2); ones(1,length(sv2))];
  span(sv2) = diag(K(sv2,sv2)) - diag(V'*invKSV*V);
end;

% Estimate the fraction of loo errors
loo = mean(output - alpha.*span < 0);
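
For reference, a minimal end-to-end call might look as follows. This sketch is not part of the original package: it assumes a two-class problem, uses the default spherical kernel with the radius/margin criterion, and pre-scales the data so that ams does not renormalize it (test_ams.m further below exercises the per-component case).

% Toy two-class problem; divide both sets by the training standard deviation
X  = randn(200,2);  Xt = randn(50,2);
s  = std(X);
X  = X ./repmat(s,size(X,1),1);
Xt = Xt./repmat(s,size(Xt,1),1);
Y  = sign(X(:,1).*X(:,2));

[sig,C,alpha,b] = ams(X,Y,'rm');   % spherical RBF kernel, radius/margin criterion

% Predict on Xt with f(x) = sum_i alpha_i K(x,x_i) + b, K a spherical RBF kernel
D2 = repmat(sum(Xt.^2,2),1,size(X,1)) + repmat(sum(X.^2,2)',size(Xt,1),1) - 2*Xt*X';
Yp = sign(exp(-0.5*D2/sig^2)*alpha + b);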

===== kernel_combination.m =====

function D = kernel_combination(K_,Y)
% D = KERNEL_COMBINATION(K,Y)
% Learn a combination of the kernels given by the 3D matrix K.
% Positive weights are returned in D [note that length(D) = size(K,3)].
% The kernels need to be centered and normalized.

global K; K = K_; clear K_;   % Global variable to gain speed

n = length(Y);     % Number of points
d = size(K,3);     % Number of kernels

% Check the kernels
for i=1:d
  K1 = K(:,:,i);
  if abs(sum(K1(:))) > 1e-5
    error('Kernel not centered in feature space');
  end;
  if abs(trace(K1) - n) > 1e-5
    error('Kernel not normalized');
  end;
end;

D = ones(d,1) / d;   % Initialization: put equal weight on all kernels

options = optimset('display','off','LargeScale','off');

% Do the optimization:
% minimize the SVM objective function under the constraints D >= 0 and sum(D)=1.
% For that, compute the gradient and the Hessian of the SVM objective
% function with respect to D and make a constrained Newton step.
% This step is computed with quadprog (needs the Optimization Toolbox).
while 1
  D(D<1e-10) = 0;
  [w,grad,hess] = obj_fun(D,Y);
  hess = (hess+hess')/2 + 1e-5*eye(size(hess));   % Because of numerical errors
  fprintf('Obj fun = %f, L0 norm of the combination = %d \n',w,sum(D>0));

  % Find the search direction.
  % If the problem were unconstrained, it would be hess \ grad (Newton step).
  [S,fval] = quadprog(hess,grad,[],[],ones(1,d),0,-D,[],[],options);

  if fval > -1e-5*w
    break;   % Stop when the relative increase of the
  end;       % objective function is small.

  % Back tracking if necessary [usually, lambda = 1]
  lambda = 1;
  non_convex = 0;
  while 1
    w1 = obj_fun(D+lambda*S,Y);
    % [The rest of kernel_combination.m, including its obj_fun
    %  subfunction, is missing from this listing.]
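
Once the weights have been learned, the combined kernel is just the weighted sum of the base kernels. A minimal sketch of how the output might be used (not part of the package; K is the centered and normalized 3D kernel array passed to kernel_combination and D is the weight vector it returns):

% Combined Gram matrix sum_i D(i)*K(:,:,i); it can then be fed to any
% kernel machine that accepts a precomputed kernel matrix.
Kc = zeros(size(K,1));
for i = 1:size(K,3)
  Kc = Kc + D(i)*K(:,:,i);
end;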

===== minimize.m =====

% [The beginning of this file -- the function declaration, help text and
%  line-search constants (RHO, SIG, INT, EXT, MAX, RATIO) -- is missing
%  from this listing; the surviving fragment follows.]

if length>0, S=['Linesearch']; else S=['Function evaluation']; end

i = 0;                                 % zero the run length counter
ls_failed = 0;                         % no previous line search has failed
fX = [];
[f1 df1] = feval(f,X,varargin{:});     % get function value and gradient
i = i + (length<0);                    % count epochs?!
s = -df1;                              % search direction is steepest
d1 = -s'*s;                            % this is the slope
z1 = red/(1-d1);                       % initial step is red/(|s|+1)

while i < abs(length)                  % while not finished
  i = i + (length>0);                  % count iterations?!

  X0 = X; f0 = f1; df0 = df1;          % make a copy of current values
  X = X + z1*s;                        % begin line search
  [f2 df2] = feval(f,X,varargin{:});
  i = i + (length<0);                  % count epochs?!
  if df2'*df2 < param.tolX
    break;                             % We are at the minimum
  end;
  d2 = df2'*s;
  f3 = f1; d3 = d1; z3 = -z1;          % initialize point 3 equal to point 1
  if length>0, M = MAX; else M = min(MAX, -length-i); end
  success = 0; limit = -1;             % initialize quantities
  while 1
    while ((f2 > f1+z1*RHO*d1) | (d2 > -SIG*d1)) & (M > 0)
      limit = z1;                                  % tighten the bracket
      if f2 > f1
        z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3);    % quadratic fit
      else
        A = 6*(f2-f3)/z3+3*(d2+d3);                % cubic fit
        B = 3*(f3-f2)-z3*(d3+2*d2);
        z2 = (sqrt(B*B-A*d2*z3*z3)-B)/A;      % numerical error possible - ok!
      end
      if isnan(z2) | isinf(z2)
        z2 = z3/2;               % if we had a numerical problem then bisect
      end
      z2 = max(min(z2, INT*z3),(1-INT)*z3);  % don't accept too close to limits
      z1 = z1 + z2;                          % update the step
      X = X + z2*s;
      [f2 df2] = feval(f,X,varargin{:});
      M = M - 1; i = i + (length<0);         % count epochs?!
      d2 = df2'*s;
      z3 = z3-z2;                  % z3 is now relative to the location of z2
    end
    if f2 > f1+z1*RHO*d1 | d2 > -SIG*d1
      break;                                 % this is a failure
    elseif d2 > SIG*d1
      success = 1; break;                    % success
    elseif M == 0
      break;                                 % failure
    end
    A = 6*(f2-f3)/z3+3*(d2+d3);              % make cubic extrapolation
    B = 3*(f3-f2)-z3*(d3+2*d2);
    z2 = -d2*z3*z3/(B+sqrt(B*B-A*d2*z3*z3)); % num. error possible - ok!
    if ~isreal(z2) | isnan(z2) | isinf(z2) | z2 < 0  % num prob or wrong sign?
      if limit < -0.5                        % if we have no upper limit
        z2 = z1 * (EXT-1);                   % then extrapolate the maximum amount
      else
        z2 = (limit-z1)/2;                   % otherwise bisect
      end
    elseif (limit > -0.5) & (z2+z1 > limit)  % extrapolation beyond max?
      z2 = (limit-z1)/2;                     % bisect
    elseif (limit < -0.5) & (z2+z1 > z1*EXT) % extrapolation beyond limit
      z2 = z1*(EXT-1.0);                     % set to extrapolation limit
    elseif z2 < -z3*INT
      z2 = -z3*INT;
    elseif (limit > -0.5) & (z2 < (limit-z1)*(1.0-INT))  % too close to limit?
      z2 = (limit-z1)*(1.0-INT);
    end
    f3 = f2; d3 = d2; z3 = -z2;              % set point 3 equal to point 2
    z1 = z1 + z2; X = X + z2*s;              % update current estimates
    [f2 df2] = feval(f,X,varargin{:});
    M = M - 1; i = i + (length<0);           % count epochs?!
    d2 = df2'*s;
  end                                        % end of line search

  if success                                 % if line search succeeded
    f1 = f2; fX = [fX' f1]';
    if param.verb > 1
      fprintf('\t%s %6i; Value %4.6e\t Gradient norm %4.6e\r', ...
              S, i, f1, df2'*df2);
    end;
    s = (df2'*df2-df1'*df2)/(df1'*df1)*s - df2;   % Polack-Ribiere direction
    tmp = df1; df1 = df2; df2 = tmp;              % swap derivatives
    d2 = df1'*s;
    if d2 > 0                        % new slope must be negative
      s = -df1;                      % otherwise use steepest direction
      d2 = -s'*s;
    end
    z1 = z1 * min(RATIO, d1/(d2-realmin));   % slope ratio but max RATIO
    d1 = d2;
    ls_failed = 0;                   % this line search did not fail
  else
    X = X0; f1 = f0; df1 = df0;      % restore point from before failed line search
    if ls_failed | i > abs(length)   % line search failed twice in a row
      break;                         % or we ran out of time, so we give up
    end
    tmp = df1; df1 = df2; df2 = tmp; % swap derivatives
    s = -df1;                        % try steepest
    d1 = -s'*s;
    z1 = 1/(1-d1);
    ls_failed = 1;                   % this line search failed
  end
end
if param.verb > 1
  fprintf('\n');
end;
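
The head of this file is missing above, but the call in ams.m, [param,fX] = minimize(param,@obj_fun,opt,...), suggests the interface: a starting point, a handle to a function returning [value, gradient], an options structure (whose length, tolX and verb fields appear in the fragment), and any extra arguments passed through to the function. Under that assumption, a hedged sketch on a simple quadratic:

% f(x) = 0.5*||x-c||^2 with gradient x-c, returned as [value, gradient]
quad = @(x,c) deal(0.5*sum((x-c).^2), x-c);
opts = struct('length',50,'tolX',1e-5,'verb',0);
[xmin,fvals] = minimize(zeros(3,1), quad, opts, [1;2;3]);   % xmin should approach [1;2;3]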

===== test_ams.m =====

function test_ams()

n  = 200;    % Number of training points
nt = 1000;   % Number of test points

% We construct a 2D toy problem which looks like a checkerboard,
% but with more variation on the 2nd component (so the length
% scale should be smaller there).
X  = rand(n,2);
Y  = sign((X(:,1)-0.5).*(mod(3*X(:,2),1)-0.5));
Xt = rand(nt,2);
Yt = sign((Xt(:,1)-0.5).*(mod(3*Xt(:,2),1)-0.5));

% Normalize by the standard deviation of each component
s = std(X);
X  = X ./repmat(s,n,1);
Xt = Xt./repmat(s,nt,1);

% Plot the toy problem
plot(X(Y== 1,1),X(Y== 1,2),'b+'); hold on;
plot(X(Y==-1,1),X(Y==-1,2),'ro'); hold off;
pause(1); drawnow;

% We now try the four different model selection criteria
meth = {'loo' 'rm' 'val' 'ev'};
for i=1:4
  [sig,C,alpha,b] = ams(X,Y,meth{i},0);   % Do the model selection
  Kt = compute_kernel(Xt,X,sig);          % Compute the test kernel ...
  te = mean(Yt.*(Kt*alpha+b)<0);          % ... and the test error
  fprintf('Method = %s,\t sigma_1 = %.3f, sigma_2 = %.3f, test error = %.3f\n',...
          meth{i},sig(1),sig(2),te);
end;
fprintf(['All methods give comparable error rates and find that the ' ...
         'length scale should be smaller on the 2nd component.\n']);


function K = compute_kernel(X,Y,sig)
% Compute the RBF kernel matrix
if length(sig)==1
  X = X / sig;
  Y = Y / sig;
else
  X = X ./ repmat(sig',size(X,1),1);
  Y = Y ./ repmat(sig',size(Y,1),1);
end;
normX = sum(X.^2,2);
normY = sum(Y.^2,2);
K = exp(-0.5*(repmat(normX ,1,size(Y,1)) + ...
              repmat(normY',size(X,1),1) - 2*X*Y'));

===== test_combination.m =====

function test_combination()

% Construct a toy problem where the scale of the data is not the same
% at different locations in space. Ideally, one should combine two RBF
% kernels with different bandwidths.
n  = 50;
X1 = randn(n,2);
Y1 = sign(sum(X1.^2,2)-0.7);
X2 = 10*randn(n,2) + repmat([10 10],n,1);
Y2 = sign(sum((X2-10).^2,2)-50);
X = [X1; X2];
Y = [Y1; Y2];

plot(X(Y== 1,1),X(Y== 1,2),'b+'); hold on;
plot(X(Y==-1,1),X(Y==-1,2),'ro'); hold off;
pause(1); drawnow;

% The base kernels are RBF kernels with various bandwidths.
sig = 2.^[-2:4];
for i=1:length(sig)
  K(:,:,i) = compute_kernel(X,X,sig(i));
end;
% And one of them is a ridge (to penalize the training errors).
K(:,:,end+1) = eye(2*n);

% Normalize the kernels (centering in feature space and variance 1)
K = normalize_kernel(K);

% Compute the linear combination
D = kernel_combination(K,Y);

% Plot the result
figure;
bar([1:length(sig) length(sig)+2],D);
lab = num2str(sig');
lab(end+1,1:5) = 'ridge';
set(gca,'XTickLabel',lab);
xlabel('Bandwidth'); ylabel('Weight');
fprintf(['The convex combination puts more weight on small and large ' ...
         'bandwidths, according to the structure of the problem.\n']);


function K = normalize_kernel(K)
% Normalize the kernels (centering in feature space and variance 1)
n = size(K,1);
for i=1:size(K,3)
  K1 = K(:,:,i);
  S = mean(K1,2);
  K1 = K1 - S*ones(1,n) - ones(n,1)*S' + mean(S);
  K1 = K1 / mean(diag(K1));
  K(:,:,i) = K1;
end;


function K = compute_kernel(X,Y,sig)
% Compute the RBF kernel matrix
if length(sig)==1
  X = X / sig;
  Y = Y / sig;
else
  X = X ./ repmat(sig',size(X,1),1);
  Y = Y ./ repmat(sig',size(Y,1),1);
end;
normX = sum(X.^2,2);
normY = sum(Y.^2,2);
K = exp(-0.5*(repmat(normX ,1,size(Y,1)) + ...
              repmat(normY',size(X,1),1) - 2*X*Y'));

===== README =====

For more details, see:
http://www.kyb.tuebingen.mpg.de/bs/people/chapelle/ams/