function [q,o2,o3] = reduce(p,type,varargin)
%
% q = reduce(p,'type',[options]) --  "Reduce" a KDE so that it requires fewer
%   kernels but retains similar representational power (better than simply resampling)
%
% 'type' is one of:
%     'mscale' -- "Multiscale data condensation" method of Mitra et al. PAMI '02
%                  Selects retained points based on k-nearest-neighbor distances
%           [options] = k (param of k-nn); controls degree of density reduction
%           Notes: The current implementation is correct but not fast.
%                  The method does not make use of the KDE's bandwidth, and
%                  it does not account for KDEs with non-uniform weights.
%
%     'rsde'   -- "Reduced set density estimator" method of Girolami & He PAMI '03
%                   Minimizes an ISE (integrated squared error) criterion by 
%                   solving a QP to induce sparsity among kernel weights.
%           [option1] = QP solution method, one of:
%               'smo'  --  Sequential Minimal Optimization (default)
%               'mult' --  Multiplicative Updating
%               'qp'   --  Standard quadratic programming (Matlab Optim. Toolbox)
%           [option2] = ISE estimate method (default: exact evaluation); see ise.m for more options
%           Notes: The underlying implementation and quadratic solvers are
%                 adapted directly from Chao He and Mark Girolami's code; see 
%                 their website for more detail
%
%    'em'      -- use Expectation-Maximization to find a diagonal-covariance
%                 Gaussian mixture approximation
%           [options] = k, the number of mixture components in the output
%
%    'grad'    -- Simple K-L gradient ascent on kernel centers, holding kernel
%                 size fixed (chosen using the "rule of thumb" (ROT) heuristic).
%           [options] = N, the number of kernel centers to retain
%                       klMethod; see klGrad for options (e.g. 'LLN' or 'ABS')
%
%    'kdtree'  -- KD-tree based reduction method of Ihler et al.
%           [options] = maxCost (double) / maxN (uint32),
%                       errType = {'maxlog', 'ise', 'kld'}
%
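% Example (a minimal usage sketch; the data and parameter choices are arbitrary):
%   p = kde(randn(2,1000),'rot');   % 2-D KDE with 1000 kernels, rule-of-thumb BW
%   q = reduce(p,'rsde','smo');     % sparse reduced-set estimate via SMO
%   r = reduce(p,'em',10);          % 10-component diagonal Gaussian mixture
%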
% See also: kde, resample

% Copyright (C) 2003 Alexander Ihler; distributable under GPL -- see README.txt

if (nargin < 2) type = 'rsde'; end;
%fprintf('Reducing KDE : %d points => ',getNpts(p));
switch (lower(type))
  case 'rsde', q=reduceRSDE(p,varargin{:});
  case 'grad', q=reduceGrad(p,varargin{:});
  case 'em',   q=reduceEM(p,varargin{:});
  case 'mscale', q=reduceMScale(p,varargin{:});
  case 'kdtree', [q,o2,o3]=reduceKD(p,varargin{:});
  otherwise, error('Unknown type for reduce!');
end;
%fprintf('%d points (%f effective)\n',getNpts(q),getNeff(q));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function q=reduceGrad(p,N,klMethod)
  if (nargin < 3) klMethod = 'LLN'; end;
  ks = getBW(ksize(p,'rot'),1);   % find ROT ksize for new kernel locations using
  dim = getDim(p);                % old ROT & adjusting for new # of kernels
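  % Rule-of-thumb bandwidths scale as Npts^(-1/(4+dim)); going from getNpts(p)
  % kernels to N kernels therefore rescales the old ROT value as follows: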
  ks = ks * (N/getNpts(p))^(-1/(4+dim));
  q = resample(p,N);              % init to something at random
  adjustBW(q,ks);                 % set the BW to above value
  alpha = .01; err2OLD = zeros(dim,N); err2 = 1;
  while (alpha * max(max(abs(err2))) > 1e-5),  % and start doing grad. ascent
    [err1, err2] = klGrad(p,q,klMethod);
    adjustPoints(q,-alpha*err2);          % Use self-adjusting rate
    if (all(err2(:).*err2OLD(:) >= 0)) alpha = alpha/.95;  % grow step while gradient signs agree,
    else alpha = alpha/1.4142; end;                        %  shrink when any component flips sign
    err2OLD = err2;
  end;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function q=reduceMScale(p,k)
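  % Outline: repeatedly keep the remaining point with the smallest k-NN radius r
  % and discard all points within 2*r of it; dense regions get small pruning
  % discs (fine detail), sparse regions large ones, hence "multiscale".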
  pts = getPoints(p);          
  [nn,r] = knn(p,pts,k+1);
  [rsort,ind] = sort(r);
  keep = [];
  while (~isempty(ind))
    keep = [keep,ind(1)];
    dists = sqrt(sum((repmat(pts(:,ind(1)),[1,length(ind)])-pts(:,ind)).^2,1));
    outside = find( dists > 2*rsort(1) );        % discard everything within 2*r of the kept point
    ind = ind(outside); rsort = rsort(outside);  % keep radii aligned with the remaining indices
  end;
  q = kde(pts(:,keep),'rot');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function q=reduceRSDE(p,type,isetype)
  global kdeReduceType;
  global kdeISEType;
  if (nargin < 2) type = 'smo'; end;               % Default to seq. min. optimization
  if (nargin < 3) isetype = 0; end; kdeISEType = isetype; % and exact ISE
  switch (lower(type)),
    case 'qp', kdeReduceType = 1;
    case 'smo', kdeReduceType = 2;
    case 'mult', kdeReduceType = 3;
    otherwise, error('Unknown reduce RSDE solve method!');
  end;

  if (p.type ~= 0) 
    error('Sorry! This method only supports Gaussian kernels currently.'); 
    % Non-Gaussian kernels: the convolution integral below is harder to compute.
  end;

  [minm,maxm] = neighborMinMax(p);            % Search over bandwidths:
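  % golden() runs a golden-section search for the bandwidth scale factor,
  % bracketing around 1 (endpoints normalized by the mean of minm and maxm) and
  % scoring each candidate via quality() below, an ISE estimate vs. the original.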
  ks =  golden(p,@quality,2*minm/(minm+maxm),1,2*maxm/(minm+maxm),1e-2);
  q = reduceKnownH(p,ks,kdeReduceType);

function res = quality(h,p)             % Evaluate quality using ISE estimate
  global kdeReduceType;                 %  (note, the minimum given by ISE
  global kdeISEType;                    %   is actually a pretty good  
  q = reduceKnownH(p,h,kdeReduceType);  %   estimate of the true min of KL)
  res = ise(p,q,kdeISEType);            %
                                        % 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function pnew = reduceKnownH(p,alpha,type)
  pts = getPoints(p); N = getNpts(p); d = getDim(p);
  q = kde(p); q.bandwidth = q.bandwidth * alpha; % For a given bandwidth "h"
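  % Writing the reduced estimate as f(x) = sum_i w_i K_i(x), its ISE against p
  % expands to w*Q*w' - 2*w*D' + const, where Q_ij = \int K_i K_j and D_i
  % estimates \int K_i p (evaluated below); minimizing over the simplex
  % {w_i >= 0, sum_i w_i = 1} is a quadratic program.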
  D=evaluate(q,p); Q = gramConvolved( q );       % find w : min w*Q*w' - 2*w*D'
  if (type == 1) newWts=quadprog(Q,-D(:),-eye(N),zeros(N,1),ones(1,N),1);
  else newWts = reduceSolve(Q,D,type);           %   via Quadratic Optimization
  end;
  if (size(q.bandwidth,2)> 2*N) BW = getBW(q,find(newWts));
  else BW = getBW(q,1); 
  end;
  pnew = kde(pts(:,find(newWts)),BW,newWts(find(newWts)));

function [minm,maxm] = neighborMinMax(npd)    % Use this to determine the searching
  maxm = sqrt(sum( (2*npd.ranges(:,1)).^2) ); %  range for valid "h"
  minm = min(sqrt(sum( (2*npd.ranges(:,1:npd.N-1)).^2 ,1)),[],2);
  minm = max(minm,1e-6);

function G = gramConvolved(p)                 % Compute Q_ij = \int K(x,x_i) K(x,x_j) dx
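  % For Gaussian kernels, \int N(x;x_i,S_i) N(x;x_j,S_j) dx = N(x_i-x_j; 0, S_i+S_j):
  % each entry is a Gaussian in the center difference with summed (diagonal)
  % variances, which is exactly what the loop below evaluates in the log domain.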
  pts = getPoints(p); N = getNpts(p); d = getDim(p);
  G = zeros(N);
  for i=1:N
    dummy=pts(:,i:end)-repmat(pts(:,i),1,N-i+1);
    BW = getBW(p,i:N).^2 + repmat(getBW(p,i).^2,[1,N-i+1]);
    tmp = -0.5*sum(dummy.^2./BW + log(BW) ,1);
    G(i,i:N) = tmp; G(i:N,i) = tmp';
  end
  G = exp(G) ./ (2*pi)^(d/2);
  
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function q=reduceEM(p,k)
  pts = getPoints(p); N = size(pts,2); d = size(pts,1);
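  % EM outline: the E-step computes each kernel center's responsibility under
  % every component; the M-step re-estimates component means and variances from
  % those responsibilities (smoothed by the original bandwidth BW).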
  % verify that the bandwidth is uniform across all kernels
  if (size(p.bandwidth,2)>2*N)
    error('Reduce: EM: only works with uniform bandwidths...');
  else BW = getBW(p,1);
  end;

  % create a random assortment of "k" diagonal-covariance Gaussians
  %initInd = randperm(N); initInd=initInd(1:k);
  %for i = 1:k, GM{i} = kde(pts(:,initInd(i)),BW); end;
  for i = 1:k, GM{i} = kde(sample(p,1),BW); end;
  converged=0; mu=zeros(d,k); vr=zeros(d,k); wts=zeros(k,N);
  while (~converged)
    converged = 1; muOld = mu; vrOld = vr;
    % E-step: find the relative weight of each point under each mixture component
    for i=1:k, wts(i,:) = evaluate(GM{i},pts); end;
    wts = wts ./ repmat(sum(wts,1),[k,1]); % normalize
    for i=1:k,  % M-step: compute conditional mean & variance & update
      mu(:,i) = pts*wts(i,:)' ./ sum(wts(i,:));
      ptsM = pts - repmat(mu(:,i),[1,N]);
      vr(:,i) = ptsM.^2 * wts(i,:)' ./ sum(wts(i,:));
      vr(:,i) = vr(:,i) + BW.^2;     % adjust for the smoothing bandwidth
      if (norm(mu(:,i)-muOld(:,i)) > 1e-5) converged = 0; end;
      if (norm(vr(:,i)-vrOld(:,i)) > 1e-5) converged = 0; end;
      GM{i} = kde(mu(:,i),sqrt(vr(:,i)));
    end;
  end
  % combine components; the fine-scale BW^2 was already added to the variances above
  %q = kde(mu,sqrt(vr + repmat(BW.^2,[1,k])),sum(wts,2)');
  q = kde(mu,sqrt(vr),sum(wts,2)');