======== data.mat ========
[binary payload omitted: MATLAB 5.0 MAT-file, platform GLNXA64, created
Sun Aug 17 16:09:09 2008. Per tester.m it supplies the demo data X and the
constants N, alpha, sigma_n and sigma_a.]

======== run_vibp.m (tail only) ========
[the tar header and the opening of this file were lost inside the binary
payload above; tester.m calls run_vibp( X , param_set ), and the surviving
tail below assembles that function's return value. The text resumes
mid-statement, apparently inside an iteration cap:]

        ... 10000
            fprintf('Finished %d iterations\n', iter);
            break
        end
    end % loop over single restart
    fprintf('\n');

    % ---- store the results in a cell array for later analysis ---- %
    % If you do not care about intermediate values, this storage
    % increases the amount of memory needed and can slow down the
    % algorithm. You might instead want to pass back only the final
    % values.
    model_set{ restart_index } = model;
    lower_bounds_log_set{ restart_index } = lower_bounds_log;
    final_lower_bound_set( restart_index ) = lower_bounds_log(1, end);

end % loop over restarts

% ---- find the best run ---- %
[ max_lb max_iter ] = max( final_lower_bound_set );
model = model_set{ max_iter };
lower_bounds_log = lower_bounds_log_set{ max_iter };
fprintf('\n----------------------------------------------\n');
fprintf('Best restart on training data: (%d, %.2f)\n\n', max_iter, max_lb);

% ---- store which run was best for the output ---- %
out.model_set = model_set;
out.lower_bounds_log_set = lower_bounds_log_set;
out.final_lower_bound_set = final_lower_bound_set;
out.best_model = model;
out.best_lower_bounds_log = lower_bounds_log;
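% ------------------------------------------------------------------- %
% Usage sketch (added; the field names are the ones assigned above, and,
% mirroring tester.m, run_vibp(X, param_set) is assumed to return this
% 'out' struct):
%
%   out   = run_vibp( X , param_set );
%   best  = out.best_model;                  % model from the best restart
%   trace = out.best_lower_bounds_log(1,:);  % its lower-bound trace
%   plot( trace );
% ------------------------------------------------------------------- %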
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Display the predicted pi and Z, the features, and the predicted
% denoised data.
if param_set.show_final_plots

    % Get predicted Z
    Z_predicted = model.nu;
    if param_set.use_finite_model
        pi_predicted = model.tau(1,:) ./ sum( model.tau , 1 );
    else
        pi_predicted = v2pi( model.tau(1,:) ./ sum( model.tau , 1 ) );
    end
    A_predicted = model.phi_mean;
    K_pred = size(Z_predicted, 2);

    % compute color scale
    image_scale = [ min( [ min(A_predicted(:)) min(X(:)) 0 ] ) ...
                    max( [ max(A_predicted(:)) max(X(:)) 1 ] ) ];

    % Show the predicted features and their weights. Sort the features
    % into left-ordered form (LOF) for easier visual comparison.
    figure(1)
    clf;
    [tmp sorted_indices] = sortrows((1-Z_predicted)');
    imagesc([pi_predicted(sorted_indices); Z_predicted(:,sorted_indices)]);
    colormap gray
    if K_pred > 0
        axis image;
    end
    title('Predicted \pi and Z');

    % Show the image features
    figure(2)
    clf;
    colormap gray;
    for k = 1:size(A_predicted,2)
        subplot(ceil(sqrt(size(A_predicted,2))), ...
                ceil(sqrt(size(A_predicted,2))), k);
        imagesc(reshape(A_predicted(:,sorted_indices(k)),4,4), image_scale );
    end
    title('Predicted image features');

    % Show the predicted underlying data uncorrupted by noise.
    figure(3)
    clf;
    colormap gray
    for n = 1:min([9 size(X,1)])
        subplot(3,3,n);
        if (K_pred > 0)
            imagesc(reshape(Z_predicted(n,:)*A_predicted',4,4), image_scale );
        end
    end
    title('Predicted denoised data');

    % Show the true data
    figure(4)
    clf;
    colormap gray
    for n = 1:min([9 size(X,1)])
        subplot(3,3,n);
        imagesc(reshape(X(n,:),4,4), image_scale );
    end
    title('True observed data');

    % Plot the lower bound
    figure(5);
    clf;
    hold on;
    iter_count = size(lower_bounds_log,2);
    legends = {'Lower bound'};
    plot( lower_bounds_log(1,:), 'k-');
    colors = {'c.', 'm.', 'g.', 'b.', 'r.'};
    labels = {'\tau optimisations', '\phi optimisations', '\nu optimisations', ...
              '\eta-\mu optimisations', 'iteration end'};
    for i = 1:5
        indices = find(lower_bounds_log(2,:) == i);
        if size(indices,2) > 0
            plot(indices, lower_bounds_log(1,indices), colors{i});
            legends{end+1} = labels{i};
        end
    end
    title('Lower bound on marginal likelihood');
    AX = legend(legends, 'Location', 'SouthEast');
    LEG = findobj(AX,'type','text');
    set(LEG,'FontSize',18);
    legend boxoff;
    fprintf(['Note that the lower bound in the plot might not' ...
             ' monotonically increase. This is due to tempering' ...
             ' and the incremental introduction of the data.\n']);
    drawnow;
end

return
======== tester.m ========

clear
addpath('vibp_base/')
addpath('util/')
addpath('heuristics/')

% load data
load( 'data.mat' );

% -------------------------- %
%       Set Parameters       %
% -------------------------- %

% set basic parameters of the data
param_set.alpha = alpha;      % IBP concentration parameter
param_set.sigma_n = sigma_n;  % noise standard deviation (used as sigma_n^2 in the code)
param_set.sigma_a = sigma_a;  % feature standard deviation

% set basic parameters of the algorithm
param_set.K = 6;               % truncation level
param_set.restart_count = 5;   % number of random restarts
param_set.vibp_iter_count = 2; % number of inner optimisation iterations to
                               % perform between heuristic search moves
param_set.use_tempering = true; % whether variances should be tempered (recommended)
param_set.compute_intermediate_lb = false; % should the lower bound be computed after
                               % each parameter update (for logging purposes)?
param_set.stopping_thresh = 0.1; % minimum improvement in the lower bound
                               % required to keep optimising

% set output parameters
param_set.show_final_plots = false; % should the final Z/reconstructions be shown at the end?

% set held-out data: test_mask(i,j) = 1 indicates that entry is used for training.
% this just demonstrates how to set up a mask to treat part of the data as unobserved.
param_set.test_mask = ones( size( X ) );
test_n_start1_ind = N - floor( N / 2 );
test_n_end1_ind = N - ceil( N / 3 );
test_n_start2_ind = N - floor( N / 3 );
test_n_end2_ind = N - ceil( N / 6 );
test_n_start3_ind = N - floor( N / 6 );
test_n_end3_ind = N;
param_set.test_mask( test_n_start1_ind:test_n_end1_ind , 1:3:end ) = 0;
param_set.test_mask( test_n_start2_ind:test_n_end2_ind , 2:3:end ) = 0;
param_set.test_mask( test_n_start3_ind:test_n_end3_ind , 3:3:end ) = 0;

% set which variational methods to run
param_set.do_finite_lg_variational = true;
param_set.do_infinite_lg_variational = true;

% ---- advanced heuristics (may be of limited use) ---- %

% search moves that combine pairs of features (e.g. sum and difference)
param_set.try_search_heuristics = true; % try search moves during optimisation
param_set.search_count = 5;             % how many search moves per iteration
param_set.search_heuristic = 'random';  % 'random' - random pair, 'full' - all pairs,
                                        % 'correlation' - pair sampled by correlation

% adding data slowly (a form of tempering)
param_set.add_data_in_blocks = true;
param_set.data_block_size = 10; % how many data points to add per iteration

% reorder the features by popularity between optimisation iterations -
% this improves the lower bound in the infinite approach.
param_set.sort_feature_order = true; % recommended for infinite approaches

% vary the order of the optimisation updates of the parameters.
param_set.vary_update_order = false; % should the order of parameter updates be
                                     % varied in each optimisation iteration?

% -------------------------- %
%       Run Inference        %
% -------------------------- %

% finite LG variational
if param_set.do_finite_lg_variational
    fprintf('#################################################\n');
    fprintf('Running Finite LG Variational\n');
    fprintf('#################################################\n');
    path(path, './vibp_finite/');
    param_set.use_finite_model = true;
    param_set.model_type = 'LG';
    t = cputime;
    model_finite_LG_variational = run_vibp( X , param_set );
    model_finite_LG_variational.time_elapsed = cputime - t;
    rmpath('./vibp_finite')
end

% infinite LG variational
if param_set.do_infinite_lg_variational
    fprintf('#################################################\n');
    fprintf('Running Infinite LG Variational\n');
    fprintf('#################################################\n');
    path(path, './vibp_infinite/');
    param_set.use_finite_model = false;
    param_set.model_type = 'LG';
    t = cputime;
    model_infinite_LG_variational = run_vibp( X , param_set );
    model_infinite_LG_variational.time_elapsed = cputime - t;
    rmpath('./vibp_infinite')
end
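% ---- added sketch (not part of the package) --------------------------- %
% Scoring the held-out entries set up by test_mask above: for the LG model
% the posterior-mean reconstruction is E[X] = nu * phi_mean' (the same
% quantity the final plots in run_vibp.m display), so a held-out MSE can
% be computed as below. This assumes the finite run was enabled and that
% run_vibp returned the 'out' struct assembled in run_vibp.m.

model_hat = model_finite_LG_variational.best_model;
X_hat     = model_hat.nu * model_hat.phi_mean';
held_out  = ( param_set.test_mask == 0 );
test_mse  = mean( ( X(held_out) - X_hat(held_out) ).^2 );
fprintf( 'Held-out MSE: %.4f\n', test_mse );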
======== heuristics/initialise_small_model.m ========

function [ model X param_set orig_data ] = initialise_small_model( model , X , param_set )
% Initialises the observed data to be only a subset of the data and
% initialises the model with the corresponding parameters.

% store the original things
param_set.orig_test_mask = param_set.test_mask;
orig_data.X = X;
orig_data.model = model;

% reduce observations in the current model
data_ind_set = zeros( size( X , 1 ) , 1 );
data_ind_set(1:param_set.data_block_size) = 1;
X = X(1:param_set.data_block_size,:);
model.nu = model.nu(1:param_set.data_block_size,:);
param_set.test_mask = param_set.test_mask(1:param_set.data_block_size,:);

% store the data-set indicators
orig_data.data_ind_set = data_ind_set;

======== heuristics/run_search_heuristics.m ========

function [model lower_bounds_log] = ...
    run_search_heuristics( model , X , lower_bounds_log , ...
                           alpha , sigma_n , sigma_a , param_set )
% function [model lower_bounds_log] = ...
%     run_search_heuristics( model , X , lower_bounds_log , ...
%                            alpha , sigma_n , sigma_a , param_set );
%
% This function tries out one of several different search-move proposals,
% accepting each if it increases the lower bound and rejecting it
% otherwise.

% get K
K = param_set.K;

% loop for the number of searches to try
for search_ind = 1:param_set.search_count
    switch param_set.search_heuristic

        % --- TRY ALL PAIRS --- %
        case 'full'
            maxLB = lower_bounds_log(1,end);
            found_better = false;

            % loop over all pairs
            for h = 1:K
                for l = [1:(h-1) (h+1):K]
                    % initialise proposal
                    new_model = model_search_move(model, h, l, ceil( rand * 5 ));

                    % compute update and store results
                    [new_model nlbl] = vibp(X, alpha, sigma_a, ...
                                            sigma_n, new_model, ...
                                            lower_bounds_log(:,end), param_set);

                    % if the proposal is better than past ones, store it
                    if nlbl(1,end) > maxLB + 0.1
                        maxLB = nlbl(1,end);
                        best_model = new_model;
                        best_lbl = nlbl(:,2:end);
                        found_better = true;
                    end
                end % l loop
            end % h loop

            % update if we had an improvement
            if found_better
                fprintf(' accepted full proposal\n');
                model = best_model;
                lower_bounds_log = [lower_bounds_log best_lbl];
            end

        % --- RANDOMLY PROPOSE --- %
        case 'random'
            % compute proposal h and l
            h = ceil(rand() * K);
            l = ceil(rand() * K);
            while l == h
                l = ceil(rand() * K);
            end
            maxLB = lower_bounds_log(1,end);
            found_better = false;

            % initialise proposal
            new_model = model_search_move(model, h, l, ceil( rand * 5 ));

            % compute update and store results
            [new_model nlbl] = vibp(X, alpha, sigma_a, sigma_n, ...
                                    new_model, lower_bounds_log(:,end), param_set);

            % if the proposal is better than past ones, store it
            if nlbl(1,end) > maxLB
                maxLB = nlbl(1,end);
                best_model = new_model;
                best_lbl = nlbl(:,2:end);
                found_better = true;
            end

            % update if we had an improvement
            if found_better
                fprintf(' accepted random proposal\n');
                model = best_model;
                lower_bounds_log = [lower_bounds_log best_lbl];
            end

        % --- PROPOSE BASED ON CORRELATIONS --- %
        case 'correlation'
            % compute the proposal's asymmetric correlation matrix
            cc = zeros(K);
            for feat_i = 1:K
                for feat_j = 1:K
                    if feat_i == feat_j
                        cc( feat_i , feat_j ) = 0;
                    else
                        cc( feat_i , feat_j ) = model.nu( : , feat_i )' ...
                            * model.nu( : , feat_j ) ...
                            / norm( model.nu( : , feat_i ) );
                    end
                end
            end
            hl = sampleMultinomial( cc(:) );
            h = mod( hl , K );
            if h == 0
                h = K;
            end
            l = ceil( hl / K );

            % Now we have h,l sampled according to their correlation
            if h ~= l
                maxLB = lower_bounds_log(1,end);
                found_better = false;

                % initialise proposal
                new_model = model_search_move(model, h, l, ceil( rand * 5 ));

                % compute update and store results
                [new_model nlbl] = vibp(X, alpha, sigma_a, sigma_n, ...
                                        new_model, lower_bounds_log(:,end), param_set);

                % if the proposal is better than past ones, store it
                if nlbl(1,end) > maxLB
                    maxLB = nlbl(1,end);
                    best_model = new_model;
                    best_lbl = nlbl(:,2:end);
                    found_better = true;
                end

                % update if we had an improvement
                if found_better
                    fprintf(' accepted correlation proposal\n');
                    model = best_model;
                    lower_bounds_log = [lower_bounds_log best_lbl];
                end
            end

        otherwise
            fprintf('No search moves\n');

    end % end switch over heuristics
end % end search loop

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A helper function for the search moves
function new_model = model_search_move(original_model, h, l, search_move)

new_model = original_model;
if search_move > 5
    search_move = mod(search_move, 5);
    tmp = h;
    h = l;
    l = tmp;
end

switch search_move
    case 1
        % Let h = h - l, leave l alone
        new_model.phi_mean(:,h) = original_model.phi_mean(:,h) - original_model.phi_mean(:,l);
        new_model.nu(:,h) = max(original_model.nu(:,h) - original_model.nu(:,l), 0.01);
    case 2
        % Let h = h + l, leave l alone
        new_model.phi_mean(:,h) = original_model.phi_mean(:,h) + original_model.phi_mean(:,l);
        new_model.nu(:,h) = min(original_model.nu(:,h) + original_model.nu(:,l), 0.99);
        phi_cov_avg = (original_model.phi_cov(:,h) + original_model.phi_cov(:,l))/2;
        new_model.phi_cov(:,h) = phi_cov_avg;
        new_model.phi_cov(:,l) = phi_cov_avg;
    case 3
        % Let h = h + l, delete l
        new_model.phi_mean(:,h) = original_model.phi_mean(:,h) + original_model.phi_mean(:,l);
        new_model.phi_mean(:,l) = zeros(size(original_model.phi_mean(:,l)));
        new_model.nu(:,h) = min(original_model.nu(:,h) + original_model.nu(:,l), 0.99);
        new_model.nu(:,l) = 0.01;
        phi_cov_avg = (original_model.phi_cov(:,h) + original_model.phi_cov(:,l))/2;
        new_model.phi_cov(:,h) = phi_cov_avg;
        new_model.phi_cov(:,l) = phi_cov_avg;
    case 4
        % randomly reset h and l
        new_model.phi_mean(:,h) = randn(size(new_model.phi_mean(:,h)))*0.01;
        new_model.phi_mean(:,l) = randn(size(new_model.phi_mean(:,h)))*0.01;
        new_model.nu(:,h) = rand(size(new_model.nu(:,h)));
        new_model.nu(:,l) = rand(size(new_model.nu(:,h)));
        phi_cov_avg = (original_model.phi_cov(:,h) + original_model.phi_cov(:,l))/2;
        new_model.phi_cov(:,h) = phi_cov_avg;
        new_model.phi_cov(:,l) = phi_cov_avg;
    case 5
        % h = h - l, l = h + l
        new_model.phi_mean(:,h) = original_model.phi_mean(:,h) - original_model.phi_mean(:,l);
        new_model.phi_mean(:,l) = original_model.phi_mean(:,h) + original_model.phi_mean(:,l);
        new_model.nu(:,h) = (original_model.nu(:,h) + original_model.nu(:,l))/2;
        new_model.nu(:,l) = (original_model.nu(:,h) + original_model.nu(:,l))/2;
        phi_cov_avg = (original_model.phi_cov(:,h) + original_model.phi_cov(:,l))/2;
        new_model.phi_cov(:,h) = phi_cov_avg;
        new_model.phi_cov(:,l) = phi_cov_avg;
end

return
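% ---- added note -------------------------------------------------------- %
% For reference, the five proposals implemented by model_search_move above,
% for a feature pair (h, l):
%
%   1. difference:  phi_h <- phi_h - phi_l,  nu_h <- max(nu_h - nu_l, 0.01)
%   2. sum:         phi_h <- phi_h + phi_l,  nu_h <- min(nu_h + nu_l, 0.99)
%   3. merge:       as move 2, then delete l (phi_l <- 0, nu_l <- 0.01)
%   4. reset:       re-randomise phi_h, phi_l, nu_h and nu_l
%   5. rotate:      phi_h <- phi_h - phi_l,  phi_l <- phi_h + phi_l,
%                   nu_h = nu_l <- (nu_h + nu_l)/2
%
% In moves 2-5 the covariances phi_cov(:,h) and phi_cov(:,l) are replaced
% by their average. Proposals are only kept if the lower bound improves.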
======== heuristics/update_feature_order.m ========

function model = update_feature_order( model , feature_order , param_set )

model.tau = model.tau(:,feature_order);
model.phi_mean = model.phi_mean(:,feature_order);
model.phi_cov = model.phi_cov(:,feature_order);
model.nu = model.nu(:,feature_order);
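% ---- added sketch (not part of the package) ---------------------------- %
% The driver that calls update_feature_order lives in the truncated
% run_vibp.m. Given tester.m's sort_feature_order option ("reorder the
% features by popularity"), a plausible call orders features by their
% expected usage counts sum_n nu(n,k):

[tmp, feature_order] = sort( sum( model.nu , 1 ) , 'descend' );
model = update_feature_order( model , feature_order , param_set );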
======== heuristics/update_small_model.m ========

function [ model X param_set orig_data ] = update_small_model( model , X , param_set , orig_data )

% store back into orig_data
orig_data.model.tau = model.tau;
orig_data.model.phi_mean = model.phi_mean;
orig_data.model.phi_cov = model.phi_cov;
orig_data.model.nu( find( orig_data.data_ind_set ) , : ) = model.nu;
if strcmp( param_set.model_type , 'iICA' )
    orig_data.model.eta( find( orig_data.data_ind_set ) , : ) = model.eta;
    orig_data.model.mu( find( orig_data.data_ind_set ) , : ) = model.mu;
end

% swap out part of the model's data
data_count = size( X , 1 );
N = size( orig_data.X , 1 );
if (data_count < N)
    remove_count = floor( param_set.data_block_size / 2 );
    remove_ind = randsample( find( orig_data.data_ind_set == 1 ) , remove_count );
    orig_data.data_ind_set( remove_ind ) = 0;
    add_ind = randsample( find( orig_data.data_ind_set == 0 ), ...
                          param_set.data_block_size + remove_count );
    orig_data.data_ind_set( add_ind ) = 1;
    X = orig_data.X( find( orig_data.data_ind_set ) , : );
    model.nu = orig_data.model.nu( find( orig_data.data_ind_set ) , : );
    param_set.test_mask = param_set.orig_test_mask( find( orig_data.data_ind_set ) , : );
end
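% ---- added sketch (not part of the package) ---------------------------- %
% How the two helpers above presumably cooperate when
% param_set.add_data_in_blocks is true (the real driver sits in the
% truncated run_vibp.m): start from one data block, then swap blocks in and
% out of the working set between optimisation sweeps. 'model' is an
% initialised variational model (fields nu, tau, phi_mean, phi_cov).

[model, X_small, param_set, orig_data] = initialise_small_model( model, X, param_set );
while size( X_small , 1 ) < size( orig_data.X , 1 )
    % ... run the vibp parameter updates on X_small here ...
    [model, X_small, param_set, orig_data] = ...
        update_small_model( model, X_small, param_set, orig_data );
end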
======== util/minimize.m ========

function [X, fX, i] = minimize(X, f, length, varargin)
% Minimize a differentiable multivariate function.
%
% Usage: [X, fX, i] = minimize(X, f, length, P1, P2, P3, ... )
%
% where the starting point is given by "X" (D by 1), and the function named in
% the string "f", must return a function value and a vector of partial
% derivatives of f wrt X, the "length" gives the length of the run: if it is
% positive, it gives the maximum number of line searches, if negative its
% absolute gives the maximum allowed number of function evaluations. You can
% (optionally) give "length" a second component, which will indicate the
% reduction in function value to be expected in the first line-search (defaults
% to 1.0). The parameters P1, P2, P3, ... are passed on to the function f.
%
% The function returns when either its length is up, or if no further progress
% can be made (ie, we are at a (local) minimum, or so close that due to
% numerical problems, we cannot get any closer). NOTE: If the function
% terminates within a few iterations, it could be an indication that the
% function values and derivatives are not consistent (ie, there may be a bug in
% the implementation of your "f" function). The function returns the found
% solution "X", a vector of function values "fX" indicating the progress made
% and "i" the number of iterations (line searches or function evaluations,
% depending on the sign of "length") used.
%
% The Polack-Ribiere flavour of conjugate gradients is used to compute search
% directions, and a line search using quadratic and cubic polynomial
% approximations and the Wolfe-Powell stopping criteria is used together with
% the slope ratio method for guessing initial step sizes. Additionally a bunch
% of checks are made to make sure that exploration is taking place and that
% extrapolation will not be unboundedly large.
%
% See also: checkgrad
%
% Copyright (C) 2001 - 2006 by Carl Edward Rasmussen (2006-09-08).

INT = 0.1;    % don't reevaluate within 0.1 of the limit of the current bracket
EXT = 3.0;    % extrapolate maximum 3 times the current step-size
MAX = 20;     % max 20 function evaluations per line search
RATIO = 10;   % maximum allowed slope ratio
SIG = 0.1; RHO = SIG/2; % SIG and RHO are the constants controlling the Wolfe-
% Powell conditions. SIG is the maximum allowed absolute ratio between
% previous and new slopes (derivatives in the search direction), thus setting
% SIG to low (positive) values forces higher precision in the line-searches.
% RHO is the minimum allowed fraction of the expected improvement (from the
% slope at the initial point in the linesearch). Constants must satisfy
% 0 < RHO < SIG < 1. Tuning of SIG (depending on the nature of the function to
% be optimized) may speed up the minimization; it is probably not worth
% playing much with RHO.

% The code falls naturally into 3 parts, after the initial line search is
% started in the direction of steepest descent. 1) we first enter a while loop
% which uses point 1 (p1) and (p2) to compute an extrapolation (p3), until we
% have extrapolated far enough (Wolfe-Powell conditions). 2) if necessary, we
% enter the second loop which takes p2, p3 and p4 chooses the subinterval
% containing a (local) minimum, and interpolates it, until an acceptable point
% is found (Wolfe-Powell conditions). Note, that points are always maintained
% in order p0 <= p1 <= p2 < p3 < p4. 3) compute a new search direction using
% conjugate gradients (Polack-Ribiere flavour), or revert to steepest if there
% was a problem in the previous line-search. Return the best value so far, if
% two consecutive line-searches fail, or whenever we run out of function
% evaluations or line-searches. During extrapolation, the "f" function may fail
% either with an error or returning Nan or Inf, and minimize should handle this
% gracefully.

if max(size(length)) == 2, red=length(2); length=length(1); else red=1; end
if length>0, S='Linesearch'; else S='Function evaluation'; end

i = 0;                                      % zero the run length counter
ls_failed = 0;                              % no previous line search has failed
[f0 df0] = feval(f, X, varargin{:});        % get function value and gradient
fX = f0;
i = i + (length<0);                         % count epochs?!
s = -df0; d0 = -s'*s;        % initial search direction (steepest) and slope
x3 = red/(1-d0);                            % initial step is red/(|s|+1)

while i < abs(length)                       % while not finished
  i = i + (length>0);                       % count iterations?!

  X0 = X; F0 = f0; dF0 = df0;               % make a copy of current values
  if length>0, M = MAX; else M = min(MAX, -length-i); end

  while 1                      % keep extrapolating as long as necessary
    x2 = 0; f2 = f0; d2 = d0; f3 = f0; df3 = df0;
    success = 0;
    while ~success && M > 0
      try
        M = M - 1; i = i + (length<0);      % count epochs?!
        [f3 df3] = feval(f, X+x3*s, varargin{:});
        if isnan(f3) || isinf(f3) || any(isnan(df3)+isinf(df3)), error(''), end
        success = 1;
      catch                    % catch any error which occurred in f
        x3 = (x2+x3)/2;        % bisect and try again
      end
    end
    if f3 < F0, X0 = X+x3*s; F0 = f3; dF0 = df3; end   % keep best values
    d3 = df3'*s;                                       % new slope
    if d3 > SIG*d0 || f3 > f0+x3*RHO*d0 || M == 0 % are we done extrapolating?
      break
    end
    x1 = x2; f1 = f2; d1 = d2;              % move point 2 to point 1
    x2 = x3; f2 = f3; d2 = d3;              % move point 3 to point 2
    A = 6*(f1-f2)+3*(d2+d1)*(x2-x1);        % make cubic extrapolation
    B = 3*(f2-f1)-(2*d1+d2)*(x2-x1);
    x3 = x1-d1*(x2-x1)^2/(B+sqrt(B*B-A*d1*(x2-x1)));   % num. error possible, ok!
    if ~isreal(x3) || isnan(x3) || isinf(x3) || x3 < 0 % num prob | wrong sign?
      x3 = x2*EXT;                          % extrapolate maximum amount
    elseif x3 > x2*EXT         % new point beyond extrapolation limit?
      x3 = x2*EXT;                          % extrapolate maximum amount
    elseif x3 < x2+INT*(x2-x1) % new point too close to previous point?
      x3 = x2+INT*(x2-x1);
    end
  end                                       % end extrapolation

  while (abs(d3) > -SIG*d0 || f3 > f0+x3*RHO*d0) && M > 0  % keep interpolating
    if d3 > 0 || f3 > f0+x3*RHO*d0          % choose subinterval
      x4 = x3; f4 = f3; d4 = d3;            % move point 3 to point 4
    else
      x2 = x3; f2 = f3; d2 = d3;            % move point 3 to point 2
    end
    if f4 > f0
      x3 = x2-(0.5*d2*(x4-x2)^2)/(f4-f2-d2*(x4-x2));   % quadratic interpolation
    else
      A = 6*(f2-f4)/(x4-x2)+3*(d4+d2);                 % cubic interpolation
      B = 3*(f4-f2)-(2*d2+d4)*(x4-x2);
      x3 = x2+(sqrt(B*B-A*d2*(x4-x2)^2)-B)/A;          % num. error possible, ok!
    end
    if isnan(x3) || isinf(x3)
      x3 = (x2+x4)/2;          % if we had a numerical problem then bisect
    end
    x3 = max(min(x3, x4-INT*(x4-x2)),x2+INT*(x4-x2));  % don't accept too close
    [f3 df3] = feval(f, X+x3*s, varargin{:});
    if f3 < F0, X0 = X+x3*s; F0 = f3; dF0 = df3; end   % keep best values
    M = M - 1; i = i + (length<0);          % count epochs?!
    d3 = df3'*s;                            % new slope
  end                                       % end interpolation

  if abs(d3) < -SIG*d0 && f3 < f0+x3*RHO*d0 % if line search succeeded
    X = X+x3*s; f0 = f3; fX = [fX' f0]';    % update variables
    % fprintf('%s %6i;  Value %4.6e\r', S, i, f0);
    s = (df3'*df3-df0'*df3)/(df0'*df0)*s - df3;  % Polack-Ribiere CG direction
    df0 = df3;                              % swap derivatives
    d3 = d0; d0 = df0'*s;
    if d0 > 0                  % new slope must be negative
      s = -df0; d0 = -s'*s;    % otherwise use steepest direction
    end
    x3 = x3 * min(RATIO, d3/(d0-realmin));  % slope ratio but max RATIO
    ls_failed = 0;             % this line search did not fail
  else
    X = X0; f0 = F0; df0 = dF0;             % restore best point so far
    if ls_failed || i > abs(length)  % line search failed twice in a row
      break;                   % or we ran out of time, so we give up
    end
    s = -df0; d0 = -s'*s;      % try steepest
    x3 = 1/(1-d0);
    ls_failed = 1;             % this line search failed
  end
end
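% ---- added example (not part of the package) ---------------------------- %
% Driving minimize.m: the objective must return both the value and the
% gradient. A two-dimensional quadratic f(x) = 0.5*x'*A*x - b'*x, whose
% minimiser is A\b, makes a quick self-check (here via an anonymous function
% and deal; the names A, b and fquad are local to this example):

A = [3 1; 1 2];  b = [1; -1];
fquad = @(x) deal( 0.5*x'*A*x - b'*x , A*x - b );  % value and gradient
[x_opt, fvals] = minimize( [0; 0], fquad, 20 );    % at most 20 line searches
% x_opt should be close to A\b = [0.6; -0.8], and fvals non-increasing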
======== util/v2pi.m ========

function piSet = v2pi( vSet )
% function piSet = v2pi( vSet )
% converts stick breaks to feature probabilities

piSet = cumprod( vSet );

return
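% ---- added example ------------------------------------------------------ %
% v2pi maps stick-breaking weights v_k to feature probabilities
% pi_k = prod_{i<=k} v_i, so the pi's are non-increasing:

pi_example = v2pi( [0.9 0.5 0.5] )   % -> [0.9000 0.4500 0.2250]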
======== vibp_base/compute_variational_lower_bound.m ========

function [lower_bound] = compute_variational_lower_bound(...
    params, X, alpha, sigma_a, sigma_n, model)
% function [lower_bound] = compute_variational_lower_bound(...
%     params, X, alpha, sigma_a, sigma_n, model )
%
% Computes the variational lower bound on the log marginal likelihood
% of the data based on the variational parameters.

% Get parameters and constants.
nu = model.nu;
tau = model.tau;
phi_mean = model.phi_mean;
phi_cov = model.phi_cov;
[N K] = size(nu);
D = size(phi_mean, 1);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The feature probabilities
digamma_sum = sum(psi(tau(1,:)) - psi(sum(tau,1)));
if params.use_finite_model
    lower_bound = K*log(alpha/K) + (alpha/K-1)*digamma_sum;
else
    lower_bound = K*log(alpha) + (alpha-1)*digamma_sum;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The feature stats distribution
feature_stat_bound = 0;
if params.use_finite_model
    feature_stat_bound = sum( nu * psi( tau( 1 , : ) )' ) ...
        + sum( ( 1 - nu ) * psi( tau( 2 , : ) )' ) ...
        - sum( psi( tau( 1 , : ) + tau( 2 , : ) )' ) * N;
else
    for k = 1:K
        % This computes the expectation of log(1 - prod v_i) by
        % introducing the variational multinomial distribution.
        % See the paper for details.
        f = compute_expected_pzk0_qjensen( tau , k );
        feature_stat_bound = feature_stat_bound ...
            + sum(nu(:,k)) * sum( psi( tau(1,1:k) ) - psi( sum(tau(:,1:k)) ) ) ...
            + (N - sum(nu(:,k))) * f;
    end
end
lower_bound = lower_bound + feature_stat_bound;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The feature distribution
feature_dist_bound = sum(phi_cov(:)) + sum(dot(phi_mean, phi_mean));
lower_bound = lower_bound - D*K/2*log(2*pi*sigma_a^2) ...
    - 1/(2*sigma_a^2) * feature_dist_bound;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The likelihood
X = X .* params.test_mask;  % mask out x's that are test data
likelihood_bound = sum( sum( X.^2 ) );
switch params.model_type
    % linear Gaussian case
    case 'LG'
        likelihood_bound = likelihood_bound - 2 * sum( sum( nu .* ( X * phi_mean ) ));

        % first term sums up the number of dimensions present in each
        % data point for the phi_cov, since this is different for
        % each; second term computes dot(phi_mean,phi_mean) but again,
        % for each n, sums only the terms in the dot product with
        % dimensions present for that n.
        tmp = params.test_mask * ( phi_cov + phi_mean.^2 );
        likelihood_bound = likelihood_bound + sum(sum( nu .* tmp ));

        % we do sum_n sum_d sum_k sum_k' over all variables and then
        % subtract out the k=k' part that we do not want; first line
        % does sum_k nu_nk phi_kd; next we do mask_nd nuphi_nd nuphi_nd
        tmp = nu * phi_mean';
        tmp2 = nu.^2 * (phi_mean').^2;
        likelihood_bound = likelihood_bound ...
            + sum(sum( params.test_mask .* tmp .* tmp )) ...
            - sum(sum( params.test_mask .* tmp2 ));
end
lower_bound = lower_bound - sum( params.test_mask(:) )/2*log(2*pi*sigma_n^2) ...
    - 1/(2*sigma_n^2) * likelihood_bound;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The entropy

% Add in a factor for pi/v as appropriate.
entropy_bound = sum( gammaln(tau(1,:)) + gammaln(tau(2,:)) ...
    - gammaln(sum(tau,1)) - (tau(1,:)-1) .* psi(tau(1,:)) ...
    - (tau(2,:)-1) .* psi(tau(2,:)) ...
    + (tau(1,:) + tau(2,:) - 2) .* psi(sum(tau,1)) );

% Add in a factor for the nonzero nu(n,k)
tmpnu = nu + .5 * ( nu == 0 ) - .5 * ( nu == 1 );
tmp = -1 * tmpnu .* log( tmpnu ) - ( 1 - tmpnu ) .* log( 1 - tmpnu );
tmp = tmp .* ( nu > 0 ) .* ( nu < 1 );
entropy_bound = entropy_bound + sum( tmp(:) );
entropy_bound = entropy_bound + K*D/2*log(2*pi*exp(1));
entropy_bound = entropy_bound + 1/2*sum(log( phi_cov(:) ));

lower_bound = lower_bound + entropy_bound;

return
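% ---- added note --------------------------------------------------------- %
% For orientation, the bound assembled above decomposes (in LaTeX form) as
%
%   \mathcal{L} = E_q[\ln p(\pi \mid \alpha)] + E_q[\ln p(Z \mid \pi)]
%               + E_q[\ln p(A \mid \sigma_a)]
%               + E_q[\ln p(X \mid Z, A, \sigma_n)] + H[q],
%
% matching the five sections of the code: feature probabilities, feature
% stats, feature distribution, likelihood, and entropy. Expectations are
% under the factorised q with q(pi_k) (or q(v_k)) Beta(tau(1,k), tau(2,k)),
% q(A_k) Gaussian (phi_mean, phi_cov), and q(z_nk) Bernoulli(nu(n,k)).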
======== vibp_base/vibp.m ========

function [model, lower_bounds_log, num_iters] = ...
    vibp(X, alpha, sigma_a, sigma_n, model, lower_bounds_log, params)
% function [model, lower_bounds_log] = ...
%     vibp(X, alpha, sigma_a, sigma_n, model, lower_bounds_log, params)
%
% Performs the variational updates and keeps track of the lower bound.
% Whether the finite or the infinite model is used depends on which
% update_tau/compute_var_theta_k are on the path (see tester.m).

% Get parameters and constants.
nu = model.nu;
tau = model.tau;
phi_mean = model.phi_mean;
phi_cov = model.phi_cov;
[N K] = size(nu);
D = size(phi_mean, 1);
lower_bound_count = 0;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Perform the variational updates.
iter = 1;
while 1

    % Update all the parameters. Note that the order of updates might
    % affect the speed of convergence, so we can play with that here.
    if params.vary_update_order
        param_update_perm = randperm(4);
    else
        param_update_perm = 1:4;
    end
    for param_index = param_update_perm
        switch param_index
            case 1
                %%%%%%%%%%%%%%%%%%%
                % Update all tau
                [ tau lower_bounds_log lower_bound_count ] = ...
                    update_tau( params, X, alpha, sigma_a, sigma_n, model, ...
                                lower_bounds_log, lower_bound_count );
                model.tau = tau;

            case 2
                %%%%%%%%%%%%%%%%%%%
                % Update all phi
                for k = 1:K
                    non_k_indices = [1:(k-1) (k+1):K];

                    % First compute the B and C matrices as outlined in the notes.
                    switch params.model_type
                        case 'LG'
                            B = (1/sigma_a^2 + params.test_mask' * nu(:,k) / sigma_n^2);
                            C = (X .* params.test_mask)' * nu(:,k);
                            for n = 1:N
                                % multiplying the mask into phi_mean is enough to
                                % zero out the dot product with A in the expectation
                                C = C - nu(n,k) * ( sum( nu(n*ones(D,1), non_k_indices) ...
                                    .* phi_mean(:, non_k_indices), 2 ) .* params.test_mask(n,:)' );
                            end
                            C = C / sigma_n^2;
                    end

                    % Then based on B and C, update phi.
                    phi_cov(:,k) = 1 ./ B;
                    phi_mean(:,k) = C ./ B;
                    model.phi_mean = phi_mean;
                    model.phi_cov = phi_cov;

                    % Update our lower bound
                    if params.compute_intermediate_lb
                        lower_bounds_log(1,end+1) = ...
                            compute_variational_lower_bound(params, X, alpha, ...
                                params.sigma_a, params.sigma_n, model);
                        lower_bounds_log(2,end) = 2;
                        lower_bound_count = lower_bound_count + 1;
                    end
                end

            case 3
                %%%%%%%%%%%%%%%%%%%
                % Update all nu
                for k = 1:K
                    % First compute var_theta as outlined in the notes.
                    % var_theta_k is the term shared across nu(:,k), so we
                    % compute it once rather than for every n.
                    var_theta_k = compute_var_theta_k( params , tau , k );
                    non_k_indices = [1:(k-1) (k+1):K];
                    for n = randperm(N)
                        switch params.model_type
                            case 'LG'
                                var_theta = var_theta_k ...
                                    - 1/(2*sigma_n^2) * ( params.test_mask(n,:) * phi_cov(:,k) ...
                                        + params.test_mask(n,:) * (phi_mean(:,k).^2) ) ...
                                    + 1/(sigma_n^2) * ( params.test_mask(n,:) .* phi_mean(:,k)' ) * ...
                                        (X(n,:)' - sum(nu(n*ones(D,1), non_k_indices) ...
                                                       .* phi_mean(:, non_k_indices), 2));
                        end

                        % Once we have var_theta, go from the canonical
                        % parameter to the mean parameter.
                        nu(n,k) = 1 / (1 + exp(-var_theta));
                        model.nu = nu;

                        % Update our lower bound
                        if params.compute_intermediate_lb
                            lower_bounds_log(1,end+1) = ...
                                compute_variational_lower_bound( params, X, alpha, ...
                                    params.sigma_a, params.sigma_n, model);
                            lower_bounds_log(2,end) = 3;
                            lower_bound_count = lower_bound_count + 1;
                        end
                    end
                end

            % (no case 4 in this excerpt: the eta-mu update applies only to
            % iICA models, so a param_index of 4 is a no-op here)
        end
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    lower_bounds_log(1,end+1) = ...
        compute_variational_lower_bound( params, X, alpha, params.sigma_a, ...
                                         params.sigma_n, model);
    lower_bounds_log(2,end) = 5;
    lower_bound_count = lower_bound_count + 1;

    % Determine stopping criteria and stop if we have converged or
    % done enough iterations.
    if (lower_bounds_log(1,end) - lower_bounds_log(1,end-lower_bound_count) ...
            < params.stopping_thresh) || (iter >= params.vibp_iter_count)
        break;
    end
    iter = iter + 1;
end
num_iters = iter;

======== vibp_finite/compute_var_theta_k.m ========

function var_theta_k = compute_var_theta_k( params , tau , k )
% function var_theta_k = compute_var_theta_k( params , tau , k )
% an intermediate function in computing the nu update

var_theta_k = psi(tau(1,k)) - psi(tau(2,k));
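% ---- added note --------------------------------------------------------- %
% In the finite model the seed term above is
% E_q[ln pi_k] - E_q[ln(1 - pi_k)] = psi(tau(1,k)) - psi(tau(2,k)).
% vibp.m then adds the likelihood terms and maps the canonical parameter
% back to the mean parameter; in LaTeX, for the LG model,
%
%   \vartheta_{nk} = \psi(\tau_{1k}) - \psi(\tau_{2k})
%     - \frac{1}{2\sigma_n^2} \sum_d ( \hat\phi_{kd} + \bar\phi_{kd}^2 )
%     + \frac{1}{\sigma_n^2} \sum_d \bar\phi_{kd}
%         \big( x_{nd} - \sum_{l \neq k} \nu_{nl} \bar\phi_{ld} \big),
%
%   \nu_{nk} = \sigma(\vartheta_{nk}) = 1 / (1 + e^{-\vartheta_{nk}}),
%
% where \bar\phi is the posterior mean, \hat\phi the posterior variance,
% and the sums over d run only over observed (test_mask == 1) dimensions.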
======== vibp_finite/update_tau.m ========

function [ tau lower_bounds_log lower_bound_count ] = ...
    update_tau( params, X, alpha, sigma_a, sigma_n, model, ...
                lower_bounds_log, lower_bound_count )
% function [ tau lower_bounds_log lower_bound_count ] = ...
%     update_tau( params, X, alpha, sigma_a, sigma_n, model , ...
%                 lower_bounds_log, lower_bound_count )

[ N K ] = size( model.nu );
tau = model.tau;

% loop through and update each tau
for k = 1:K
    sum_nu_k = sum( model.nu(:,k) );
    tau(1,k) = alpha/K + sum_nu_k;
    tau(2,k) = 1 + N - sum_nu_k;
    model.tau = tau;

    % Update our lower bound
    if params.compute_intermediate_lb
        lower_bounds_log(1,end+1) = ...
            compute_variational_lower_bound( params, X, alpha, sigma_a, sigma_n, model );
        lower_bounds_log(2,end) = 1;
        lower_bound_count = lower_bound_count + 1;
    end
end

======== vibp_infinite/compute_expected_pzk0_qjensen.m ========

function [lb q] = compute_expected_pzk0_qjensen( tau , k )
% function [lb q] = compute_expected_pzk0_qjensen( tau , k );
% computes E[ ln( 1 - prod( v ) ) ] where the prod runs from 1..k
% using the q-distribution described by YWT

% select our relevant set of tau's (up to k)
tau = tau( : , 1:k );
digamma_tau = psi( tau );
digamma_sum = psi( sum( tau ) );

% compute the optimal q distribution
digamma_tau1_cumsum = [ 0 cumsum( digamma_tau( 1 , : ) ) ];
digamma_sum_cumsum = cumsum( digamma_sum );
tmp = digamma_tau( 2 , : ) + digamma_tau1_cumsum(1:k) - digamma_sum_cumsum;
q = exp( tmp - max(tmp) );
q = q / sum(q);

% compute the lb
lb = sum( q .* ( tmp - log( q ) ) );

return

======== vibp_infinite/compute_var_theta_k.m ========

function var_theta_k = compute_var_theta_k( params , tau , k )
% function var_theta_k = compute_var_theta_k( params , tau , k )
% an intermediate function in computing the nu update

f = compute_expected_pzk0_qjensen( tau, k );
var_theta_k = sum( psi( tau(1,1:k) ) - psi( sum(tau(:,1:k)) ) ) - f;

return

======== vibp_infinite/update_tau.m ========

function [ tau lower_bounds_log lower_bound_count ] = ...
    update_tau( params, X, alpha, sigma_a, sigma_n, model , ...
                lower_bounds_log, lower_bound_count )
% function [ tau lower_bounds_log lower_bound_count ] = ...
%     update_tau( params, X, alpha, sigma_a, sigma_n, model , ...
%                 lower_bounds_log, lower_bound_count )

nu = model.nu;
[ N K ] = size( nu );
sum_n_nu = sum(nu,1);
N_minus_sum_n_nu = N - sum_n_nu;

% Iterate through and update each tau(:,k)
for k = 1:K

    % First we compute q_m for m = k:K
    tau = model.tau;
    digamma_tau = psi( tau );
    digamma_sum = psi( sum( tau ) );
    digamma_tau1_cumsum = [ 0 cumsum( digamma_tau( 1 , 1:(K-1) ) ) ];
    digamma_sum_cumsum = cumsum( digamma_sum );
    exponent = digamma_tau( 2 , : ) + digamma_tau1_cumsum - digamma_sum_cumsum;
    unnormalized = exp(exponent - max(exponent));
    qs = zeros(K,K);
    for m = k:K
        qs(m, 1:m) = unnormalized(1:m) / sum(unnormalized(1:m));
    end

    % Now that we have the q's, update tau(:,k)
    tau(1,k) = sum(sum_n_nu(k:K)) + N_minus_sum_n_nu(k+1:K) * ...
        sum(qs(k+1:end, k+1:end),2) + alpha;
    tau(2,k) = N_minus_sum_n_nu(k:K) * qs(k:K, k) + 1;

    % Finally commit the updated tau and compute the lower bound if desired.
    model.tau(:,k) = tau(:,k);
    if params.compute_intermediate_lb
        lower_bounds_log(1,end+1) = ...
            compute_variational_lower_bound( params, X, alpha, sigma_a, sigma_n, model);
        lower_bounds_log(2,end) = 1;
        lower_bound_count = lower_bound_count + 1;
    end
end

return
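% ---- added note --------------------------------------------------------- %
% The multinomial (Jensen) bound behind compute_expected_pzk0_qjensen and
% the q's built in update_tau above: writing
%
%   1 - \prod_{m=1}^{k} v_m = \sum_{y=1}^{k} (1 - v_y) \prod_{m<y} v_m
%
% and introducing a variational multinomial q(y) over which stick breaks
% first gives, with v_m ~ Beta(\tau_{1m}, \tau_{2m}),
%
%   E[\ln(1 - \prod_{m=1}^{k} v_m)]
%     \ge \sum_{y=1}^{k} q(y) \Big[ \psi(\tau_{2y})
%           + \sum_{m<y} \psi(\tau_{1m})
%           - \sum_{m \le y} \psi(\tau_{1m} + \tau_{2m}) \Big] + H(q).
%
% The bracketed term is the vector 'tmp' in compute_expected_pzk0_qjensen;
% the bound is maximised by q(y) \propto exp(tmp_y), which is exactly the
% normalised 'q' the code computes, and 'lb' = sum(q .* (tmp - log(q))).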