%DTC Verzakov Tree - Trainable Decision Tree Classifier
%
%   W = DTC(A, CRIT, CHISQSTOPVAL, PRUNE, T, NFEAT)
%
% INPUT
%   A    Training dataset.
%
%        Object weights and class priors:
%        It is possible to assign individual weights to dataset objects
%        by using the 'weights' identifier:
%
%          A = SETIDENT(A, WEIGHTS, 'weights')
%
%        If weights are not defined, they are assumed to be equal to 1 for
%        all objects. The actual weights used by the training routine are
%        computed from the supplied weights along with the class priors
%        (if priors are not set, the apparent priors are used):
%
%          ACTUAL_WEIGHTS(I) = M*(WEIGHTS(I)/CW(C))*(PRIOR(C)/SUM(PRIOR))
%
%        where M is the total number of (labelled) objects in A, CW is the
%        vector of weight sums for each class, and C is the class of
%        object I. The sum of all actual weights is M.
%
%        Feature types:
%        Features are treated differently based on their domain information.
%        If the feature domain is empty or is an interval (or a collection
%        of intervals), the feature is considered to be continuous and
%        branches are created by splitting at a threshold value. Otherwise
%        (the feature domain specifies a set of values or a set of names),
%        the feature is considered to be nominal and branches corresponding
%        to all present feature values are created.
%
%        Unknown and 'non-applicable' values:
%        If a feature value is NaN, it is assumed to be unknown. In such a
%        situation the object with the unknown value is split into
%        fractions and sent down all branches.
%        The concept of the 'non-applicable' feature value is different
%        from the concept of the unknown (missing) value.
%        ...
%        The 'non-applicable' values of continuous features are encoded as
%        INF. If the feature domain is a (set of) interval(s), then the INF
%        value has to be explicitly added to the domain definition. The
%        'non-applicable' value of nominal features does not have a
%        predefined encoding. If it is necessary, the user has to include
%        such a value into the domain definition.
%
%   CRIT Splitting criterion name.
%        'igr'    Information Gain Ratio (default):
%                 As defined by Quinlan. A penalty on the number of
%                 distinct values of a continuous feature is used. If the
%                 gain is zero or negative due to such penalization, the
%                 split is not performed. This leads to smaller trees and
%                 may give a non-zero training error. This criterion does
%                 not use costs (costs are used only at the classification
%                 step).
%
%        'gini'   Gini impurity index. More precisely, the change in this
%                 index. The Gini index can be interpreted as the
%                 misclassification rate of a stochastic, prior-based
%                 classifier, so costs are naturally embedded. If the
%                 change in the (absolute) error is less than or equal to
%                 0.1 (the change in the cost is less than or equal to 0.1
%                 of the minimal absolute value of the non-zero costs), the
%                 split is not performed. This leads to smaller trees and
%                 may give a non-zero training error.
%
%        'miscls' Classification error criterion.
%                 To be used only for educational purposes because it gives
%                 rather inferior results. Costs are naturally embedded.
%
%   CHISQSTOPVAL Early stopping critical value for the chi-squared test on
%        the difference between the original node class distribution and
%        the branches' class distributions. CHISQSTOPVAL is 0 by default,
%        which means that branches will be discarded only if they bring no
%        change in the class distribution.
%
%   PRUNE Pruning type name
%        'prunep' - pessimistic (top-down) pruning as defined by Quinlan.
%                   Pessimistic pruning cannot be performed if a cost
%                   matrix is defined.
%        'prunet' - test set (bottom-up) pruning using the (required)
%                   dataset T.
%        These implementations of both pruning algorithms may not be
%        exactly correct if there are unknown values in the datasets.
%
%   T    Test set for the test set pruning
%
% OUTPUT
%   W    Classifier mapping
%
% DESCRIPTION
% If the (fully grown) branches of a (sub)tree do not improve the
% classification error (misclassification cost), they are immediately
% discarded. This may happen because we use regularized posteriors. As a
% result the algorithm is more stable and trees are smaller, but the split
% on the training set may not be perfect.
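%
% EXAMPLE
% The lines below are an illustrative sketch only (not part of the original
% interface description). They assume a standard PRTools setup in which
% GENDATB and TESTC are available; any dataset with crisp labels works.
%
%   a  = gendatb([50 50]);                % training set, two classes
%   t  = gendatb([200 200]);              % independent set, used for pruning
%   w  = dtc(a, 'igr');                   % fully grown tree, default criterion
%   e  = t*w*testc;                       % test error of the unpruned tree
%   wp = dtc(a, 'igr', 0, 'prunet', t);   % the same tree, pruned on T
%   ep = t*wp*testc;                      % test error after pruning
%
% Individual object weights can be attached beforehand with
% A = SETIDENT(A, WEIGHTS, 'weights'), as described above.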
%
% REFERENCES
% [1] J.R. Quinlan, Simplifying Decision Trees, International Journal of
%     Man-Machine Studies, 27(3), pp. 221-234, 1987.
% [2] J.R. Quinlan, Improved Use of Continuous Attributes in C4.5, Journal
%     of AI Research, 4, pp. 77-90, 1996.
%
% SEE ALSO
% DATASETS, MAPPINGS, TREEC

% Copyright: S. Verzakov, s.verzakov@gmail.com
% Based on prtools' treec, tree_map, and their subroutines
% by Guido te Brake and R.P.W. Duin

function w = dtc(a, crit, chisqstopval, prune, t, nfeat)

  % When no input data is given, an empty (untrained) tree is defined:
  if (nargin == 0) || isempty(a)
    if nargin < 2 || isempty(crit)
      crit = 'igr';
    end
    if nargin < 3 || isempty(chisqstopval)
      chisqstopval = 0;
    end
    if nargin < 4 || isempty(prune)
      prune = '';
    end
    if nargin < 5
      t = [];
    end
    if nargin < 6
      nfeat = [];
    end
    w = prmapping('dtc', {crit, chisqstopval, prune, t, nfeat});
    w = setname(w, ['DecTree' upper(crit)]);

  elseif (nargin == 2) && ismapping(crit)
    % Execution
    w = crit;
    tree = +w;
    % B contains posteriors.
    % We do not need to convert posteriors to costs,
    % PRTools will do it automatically.
    b = mapdt(tree, +a);
    b = setdat(a, b, w);
    % b = setfeatlab(b, getlabels(w));
    w = b;

  else
    % Training
    if nargin < 2 || isempty(crit)
      crit = 'igr';
    end
    if nargin < 3 || isempty(chisqstopval)
      chisqstopval = 0;
    end
    if nargin < 4 || isempty(prune)
      prune = '';
    end
    if nargin < 5
      t = [];
    end
    if nargin < 6
      nfeat = [];
    end

    if ~any(strcmpi(crit, {'igr', 'gini', 'miscls'}))
      error('Unknown splitting criterion');
    end

    islabtype(a, 'crisp');
    isvaldfile(a, 1, 2);       % at least 1 object per class, 2 classes
    a = seldat(prdataset(a));  % get rid of all unlabelled objects
    m = size(a, 1);

    % Get weights (if defined)
    weights = getident(a, 'weights');
    if isempty(weights)
      weights = ones([m 1]);
    else
      idx = (weights > 0);
      if nnz(idx) < m
        a = a(idx, :);
        weights = weights(idx);
      end
      isvaldset(a, 1, 2);
    end

    % First get some useful parameters:
    % Sizes
    [m, k, c] = getsize(a);
    cs = classsizes(a);

    % Determine if features are categorical
    featdom = getfeatdom(a);
    featdom = featdom(:)';
    if isempty(featdom)
      feattype = zeros(1, k);
    else
      feattype = cellfun(@(x) ischar(x) || (size(x,1) == 1), featdom);
    end

    % Features to use
    usefeat = true([1, k]);
    if ~isempty(nfeat)
      nfeat = min(nfeat, k);
    end

    % Labelling
    nlab = getnlab(a);

    % Get priors
    prior = getprior(a);
    prior = prior/sum(prior);

    % Define weights:
    % the total sum of weights is equal to m.
    % Compute class weights
    cw = zeros(size(cs));
    for i = 1:c
      cw(i) = sum(weights(nlab == i));
    end

    % Rescale object weights
    % by class weight factors derived from priors
    cwf = m*prior./cw;
    weights = weights.*(cwf(nlab)).';

    % Misclassification cost
    cost = a.cost;

    % Now the training can really start:
    tree = makedt(+a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit, chisqstopval);

    if ~isempty(prune)
      if strcmpi(prune, 'prunep')
        if ~isempty(cost)
          error('Pessimistic pruning based on misclassification costs is not implemented');
        end
        tree = prunep(tree);
      elseif strcmpi(prune, 'prunet')
        if isempty(t)
          error('Test set is not specified for the test set pruning');
        end
        tree = prunet(tree, t, cost);
      else
        error('Unknown pruning method');
      end
      tree = cleandt(tree);
    end

    % Store the results:
    w = prmapping('dtc', 'trained', {tree}, getlablist(a), k, c);
    w = setname(w, ['DecTree' upper(crit)]);
    w = setcost(w, cost);
  end

return
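% The weight rescaling performed in DTC above can be illustrated with a small
% (hypothetical) numeric example for two classes under apparent priors:
%
%   nlab    = [1 1 1 2].';              % 3 objects of class 1, 1 of class 2
%   weights = [1 1 1 1].';              % unit user weights
%   prior   = [0.75 0.25];              % apparent priors, m = 4
%   cw      = [3 1];                    % per-class weight sums
%   cwf     = 4*prior./cw;              % = [1 1]
%   weights = weights.*(cwf(nlab)).';   % unchanged: priors match the data
%
% With PRIOR = [0.5 0.5] instead, CWF becomes [2/3 2], i.e. class-2 objects
% are up-weighted so that both classes carry the same total weight (2 each)
% and the overall sum of weights stays equal to M = 4.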
%MAKEDT General tree building algorithm
%
%   TREE = MAKEDT(A, FEATTYPE, USEFEAT, NFEAT, C, NLAB, WEIGHTS, COST, CRIT, CHISQSTOPVAL)
%
% INPUT
%   A        Data matrix
%
%   FEATTYPE Row defining the feature types (0 - continuous, 1 - nominal)
%
%   USEFEAT  Row defining the features to be used for splitting
%
%   NFEAT    The maximum number of features for which splitting has to be
%            attempted. If the number of available features is greater
%            than NFEAT, a random subset of NFEAT features will be used
%            for splitting.
%
%   C        Total number of classes in the initial dataset
%
%   NLAB     Numeric class labels of objects in A
%
%   WEIGHTS  Weights of objects in A
%
%   COST     Misclassification costs
%
%   CRIT     Name of the splitting criterion to be used
%
%   CHISQSTOPVAL Critical value for the chi-squared test
%
% OUTPUT
%   TREE     Structure containing arrays defining the decision tree.
%
%     .NSMP  The number of samples. NSMP(J) is the sum of weights of the
%            objects which reached node J.
%
%     .POST  Posterior probabilities. POST(J, :) is the class distribution
%            at node J. Object weights are taken into account. 0 and 1
%            probabilities are avoided by performing Bayes uniform priors
%            regularization.
%
%     .CIDX  Class index. CIDX(J) is the class corresponding to node J if
%            it is considered as a leaf.
%
%     .ERRL  Leaf error. ERRL(J) is the misclassification error (cost) on
%            the training samples which reached node J if this node is
%            considered to be a leaf.
%
%     .ERRT  Subtree error. ERRT(J) is the misclassification error (cost)
%            on the training samples which reached node J if this node is
%            considered to be the root of a (sub)tree.
%
%     .SIZE  Tree size. SIZE(J) is the size of the (sub)tree with root at
%            node J.
%
%     .FIDX  Feature index. FIDX(J) is the index of the feature on which
%            node J is split. FIDX(J) == 0 means that J is a leaf.
%
%     .FVAL  Feature value(s). For nominal features FVAL{J} is the set of
%            feature values observed at node J (LENGTH(FVAL{J}) == 0 for a
%            leaf, otherwise it is > 1).
%            For continuous features, if LENGTH(FVAL{J}) == 1 then it
%            contains the threshold THR for splitting into the left
%            (<= THR) and the right (> THR) branches. If FVAL{J} == []
%            (and J is not a leaf), the split is performed between
%            applicable and non-applicable values.
%
%     .NIDX  Branch node indices. For nominal features NIDX{J}(K) is the
%            index of the branch node of node J with value FVAL{J}(K) of
%            feature FIDX(J). For continuous features (if
%            LENGTH(FVAL{J}) == 1), NIDX{J}(1) is the index of the left
%            branch node and NIDX{J}(2) is the index of the right branch
%            node; if non-applicable (INF) values are also present,
%            NIDX{J}(3) is the index of the non-applicable branch.
%            If FVAL{J} == [] (and J is not a leaf), then NIDX{J}(1) is the
%            index of the branch node containing objects with applicable
%            values of feature FIDX(J) and NIDX{J}(2) is the index of the
%            branch node containing objects with non-applicable values of
%            the same feature.
%
% This is a low-level routine called by DTC.
%
% See also IGR, GINI, MISCLS
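%
% As an illustrative (hypothetical) example of this encoding, a stump that
% splits feature 2 at threshold 0.5 into two pure leaves could look like:
%
%   tree.fidx = [2; 0; 0];        % node 1 splits on feature 2, nodes 2-3 are leaves
%   tree.fval = {0.5; []; []};    % threshold of node 1
%   tree.nidx = {[2 3]; []; []};  % left branch -> node 2, right branch -> node 3
%   tree.cidx = [1; 1; 2];        % class assigned to each node as a leaf
%   tree.size = [3; 1; 1];        % subtree sizes
%
% NSMP, POST, ERRL and ERRT are filled in analogously, one row per node.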
function tree = makedt(a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit, chisqstopval)

  % Construct the tree:
  % Find (absolute) class frequencies
  C = zeros(1, c);
  for j = 1:c
    C(j) = sum(weights(nlab == j));
  end
  NC = nnz(C);

  tree.nsmp = sum(C);

  % regularization by 'uniform' Bayesian priors
  C0 = C;
  C = C + 1;
  p = C/sum(C);

  if isempty(cost)
    [maxpost, cidx] = max(p);
    errc = tree.nsmp*(1 - maxpost);
    %errc = tree.nsmp - C0(cidx);
  else
    costp = p*cost;
    [mincost, cidx] = min(costp);
    errc = mincost;
  end

  tree.post = p;
  tree.cidx = cidx;
  tree.errl = errc;
  tree.errt = errc;
  tree.size = 1;

  if NC ~= 1  % not a pure class dataset
    % now the tree is recursively constructed further:
    % use the desired split criterion
    [fidx, fval, nb, bidx, chisqval] = findsplit(+a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit);

    % When the stop criterion is not reached yet, we recursively split further:
    if ~isempty(fidx) && (chisqval > chisqstopval)
      tree.fidx = fidx;
      tree.fval = {fval};
      if feattype(fidx) > 0
        usefeat(fidx) = 0;
      end

      uidx = bidx == 0;
      nu = nnz(uidx);
      if nu > 0
        knsmp = tree.nsmp - sum(weights(uidx));
      end

      tree.nidx = {zeros(1, nb)};
      tree.errt(1) = 0;
      for j = 1:nb
        tree.nidx{1}(j) = tree.size(1) + 1;
        J = bidx == j;
        if nu == 0
          bweights = weights(J);
        else
          J = J | uidx;
          bweights = weights(J);
          buidx = uidx(J);
          bknsmp = sum(bweights(~buidx));
          bweights(buidx) = (bknsmp/knsmp)*bweights(buidx);
        end
        branch = makedt(+a(J, :), feattype, usefeat, nfeat, c, nlab(J), bweights, cost, crit, chisqstopval);
        branch.nidx = cellfun(@(x) x + tree.size(1)*(x > 0), branch.nidx, 'UniformOutput', false);

        tree.errt(1) = tree.errt(1) + branch.errt(1);
        tree.size(1) = tree.size(1) + size(branch.nidx, 1);

        tree.nsmp = [tree.nsmp; branch.nsmp];
        tree.post = [tree.post; branch.post];
        tree.cidx = [tree.cidx; branch.cidx];
        tree.errl = [tree.errl; branch.errl];
        tree.errt = [tree.errt; branch.errt];
        tree.size = [tree.size; branch.size];
        tree.fidx = [tree.fidx; branch.fidx];
        tree.fval = [tree.fval; branch.fval];
        tree.nidx = [tree.nidx; branch.nidx];
      end
    end
  end

  % no improvement in error (cost), rollback
  if (tree.size(1) > 1) && (tree.errt(1) >= tree.errl(1))
    tree.nsmp = tree.nsmp(1);
    tree.post = tree.post(1,:);
    tree.cidx = tree.cidx(1);
    tree.errl = tree.errl(1);
    tree.errt = tree.errl(1);  % sic!
    tree.size = 1;
  end

  if tree.size(1) == 1
    % We reached the stop criterion or no further split is possible,
    % so we make a leaf node:
    tree.fidx = 0;
    tree.fval = {[]};
    tree.nidx = {[]};
  end

return
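% The CHISQVAL used in the stopping test above is a plain chi-squared
% statistic of the branch class counts against the counts expected from the
% parent distribution. A rough (hypothetical) two-class illustration:
%
%   L  = [30 10];  R = [10 30];        % observed class counts in the branches
%   K  = L + R;  ML = sum(L);  MR = sum(R);  MK = sum(K);
%   EL = K*(ML/MK);  ER = K*(MR/MK);   % expected counts if the split changed nothing
%   chisq = sum((L - EL).^2./EL + (R - ER).^2./ER);   % = 20 here
%
% With the default CHISQSTOPVAL = 0, any split that changes the class
% distribution passes; larger values stop the recursion earlier.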
%MAPDT Tree mapping and node statistic calculation
%
%   [P, S] = MAPDT(TREE, A, NLAB, WEIGHTS)
%
function [p, s] = mapdt(tree, a, nlab, weights)

  persistent dt st

  if isstruct(tree)
    % top-level call: store the tree and map all objects of A
    dt = tree;
    if nargin < 4
      weights = [];
    end
    if nargin < 3
      nlab = [];
    end
    m = size(a, 1);
    [n, c] = size(dt.post);
    p = zeros([m, c]);
    if (nargout < 2) || isempty(nlab)
      st = [];
      for i = 1:m
        p(i, :) = mapdt(1, +a(i, :));
      end
    else
      st = zeros([n, c]);
      nlab(nlab > c) = 0;
      if isempty(weights)
        weights = ones(m, 1);
      end
      for i = 1:m
        p(i, :) = mapdt(1, +a(i, :), nlab(i), weights(i));
      end
    end
    s = st;
    clear dt st
  else
    % recursive call: TREE is the index of the node to descend from
    j = tree;
    while j > 0
      if ~isempty(st) && (nlab > 0)
        st(j, nlab) = st(j, nlab) + weights;
      end

      k = 0;
      fidx = dt.fidx(j);
      if fidx ~= 0
        nidx = dt.nidx{j};
        aval = a(fidx);
        if ~isnan(aval)
          fval = dt.fval{j};
          if isempty(fval)
            % split between applicable (finite) and non-applicable (INF) values
            k = nidx(2 - (aval ~= inf));
          elseif length(fval) == 1
            % continuous feature: threshold split, third branch for INF if present
            if aval ~= inf
              k = nidx(2 - (aval <= fval));
            elseif length(nidx) == 3
              k = nidx(3);
            end
          else
            % nominal feature: pick the branch of the matching value
            k = nidx(aval == fval);
            if isempty(k)
              k = 0;
            end
          end
        else
          % unknown value: mix the branch posteriors, weighted by branch sizes
          p = zeros([1, size(dt.post, 2)]);
          if isempty(st) || (nlab <= 0)
            for b = 1:length(nidx)
              k = nidx(b);
              f = (dt.nsmp(k)/dt.nsmp(j));
              p = p + f*mapdt(nidx(b), a);
            end
          else
            for b = 1:length(nidx)
              k = nidx(b);
              f = (dt.nsmp(k)/dt.nsmp(j));
              p = p + f*mapdt(nidx(b), a, nlab, f*weights);
            end
          end
          k = -1;
        end
      end

      if k == 0
        p = dt.post(j, :);
      end
      j = k;
    end
  end

return
%FINDSPLIT General routine for finding the best split
%
%   [FIDX, FVAL, NB, BIDX, CHISQVAL, CRITVAL] = FINDSPLIT(A, FEATTYPE, USEFEAT, NFEAT, C, NLAB, WEIGHTS, COST, CRIT)
%
function [fidx, fval, nb, bidx, chisqval, critval] = findsplit(a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit)

  selfeatidx = find(usefeat);
  nf = length(selfeatidx);
  if ~isempty(nfeat) && (nfeat < nf)
    permidx = randperm(length(selfeatidx));
    selfeatidx = selfeatidx(permidx(1:nfeat));
    nf = nfeat;
  end

  fval = cell([1, nf]);
  chisqval = zeros([1, nf]);
  critval = nan([1, nf]);

  % repeat for all selected features
  for f = 1:nf
    fidx = selfeatidx(f);
    af = a(:, fidx);
    kidx = ~isnan(af);  % known values index
    if nnz(kidx) == 0
      continue
    end
    MU = sum(weights(~kidx)) + realmin;  % total weight of objects with unknown values
    af = af(kidx);
    wk = weights(kidx);
    nlabk = nlab(kidx);

    switch feattype(fidx)
      case 0  % continuous/ordered feature
        naidx = af == inf;  % non-applicable values
        NA = repmat(realmin, [1 c]);
        MNA = c*realmin;
        nlabna = nlabk(naidx);
        nna = length(nlabna);
        if nna > 0
          for j = 1:c
            NA(1,j) = sum(wk(nlabna == j)) + realmin;
          end
          MNA = sum(NA);
        end

        apidx = find(~naidx);
        af = af(apidx);
        wap = wk(apidx);
        nlabap = nlabk(apidx);
        [af, sortidx] = sort(af);
        wap = wap(sortidx);
        nlabap = nlabap(sortidx);

        if length(af) == 1
          uv = af;
          ns = 0;
          sli = [];
        else
          labchngcount = cumsum([1; double(diff(nlabap) ~= 0)]);
          uniquevallowidx = find([true; ((diff(af)./(0.5*abs(af(1:end-1) + af(2:end)) + realmin)) > 1e-8)]);
          uniquevalhighidx = [(uniquevallowidx(2:end) - 1); length(af)];
          % unique values
          uv = af(uniquevalhighidx);
          % split low indices in unique values
          sli = find(labchngcount(uniquevalhighidx(2:end)) - labchngcount(uniquevallowidx(1:end-1)) > 0);
          % split low indices in af
          splitlowidx = uniquevalhighidx(sli);
          ns = length(splitlowidx);
        end

        if (ns == 0) && (nna == 0)
          continue
        end

        % applicable, left, and right branch class counts
        % (L(s, j) is the class-j weight going left at candidate threshold s)
        AP = zeros(1, c);
        L = zeros(ns, c);
        R = zeros(ns, c);
        for j = 1:c
          J = find(nlabap == j);
          mj = length(J);
          AP(j) = sum(wap(J));
          if (ns > 0) && (mj > 0)
            L(:, j) = (repmat(splitlowidx, [1, mj]) >= repmat(J.', [ns, 1]))*wap(J) + realmin;
            R(:, j) = AP(j) - L(:, j) + realmin;
          end
        end
        AP = AP + 2*realmin;

        % total count of applicable
        MAP = sum(AP);
        % known
        K = AP + NA;
        MK = MNA + MAP;
        % object counts for branches
        ML = sum(L, 2);
        MR = sum(R, 2);

        [cv, i] = feval(crit, 0, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli);
        critval(f) = cv;
        if ~isnan(cv) && (cv > -inf)
          if (ns > 0) && ~isempty(i)
            t = splitlowidx(i);
            fval{f} = 0.5*(af(t) + af(t+1));
            if nna == 0
              APL = AP*(ML(i)/MAP);
              APR = AP*(MR(i)/MAP);
              chisqval(f) = sum(((L(i, :) - APL).^2)./APL + ((R(i, :) - APR).^2)./APR);
            else
              KNA = K*(MNA/MK);
              KL = K*(ML(i)/MK);
              KR = K*(MR(i)/MK);
              chisqval(f) = sum(((NA - KNA).^2)./KNA + ((L(i, :) - KL).^2)./KL + ((R(i, :) - KR).^2)./KR);
            end
          else
            fval{f} = [];
            KNA = K*(MNA/MK);
            KAP = K*(MAP/MK);
            chisqval(f) = sum(((NA - KNA).^2)./KNA + ((AP - KAP).^2)./KAP);
          end
        end

      case 1  % nominal feature
        vf = unique(af);
        v = length(vf);
        B = zeros(v, c);
        for j = 1:c
          J = find(j == nlabk);
          mj = length(J);
          if mj > 0
            B(:, j) = (repmat(vf, [1, mj]) == repmat(af(J).', [v 1]))*wk(J);
          end
        end
        B = B + realmin;

        % class counts for known
        K = sum(B, 1);
        % total known count
        MK = sum(K);
        % object counts for branches
        MB = sum(B, 2);

        cv = feval(crit, 1, MU, MK, K, MB, B, cost, weights);
        critval(f) = cv;
        if ~isnan(cv) && (cv > -inf)
          KB = MB.*K/MK;
          chisqval(f) = sum(sum(((B - KB).^2)./KB));
          fval{f} = vf;
        end
    end
  end

  % best criterion over all features
  testfeatidx = find(~isnan(critval) & (critval > -inf));
  if isempty(testfeatidx)
    fidx = [];
    fval = [];
    nb = [];
    bidx = [];
    chisqval = [];
    critval = [];
  else
    [critval, fidx] = max(critval(testfeatidx));
    fidx = testfeatidx(fidx);
    fval = fval{fidx};
    chisqval = chisqval(fidx);
    fidx = selfeatidx(fidx);

    % assign every object to a branch of the winning split
    af = a(:, fidx);
    m = size(af, 1);
    bidx = zeros(m, 1);
    switch feattype(fidx)
      case 0
        apidx = af < inf;
        if ~isempty(fval)
          lidx = af <= fval;
          ridx = apidx & ~lidx;
          bidx(lidx) = 1;
          bidx(ridx) = 2;
          nb = 2;
        else
          bidx(apidx) = 1;
          nb = 1;
        end
        naidx = ~isnan(af) & ~apidx;
        if nnz(naidx) > 0
          nb = nb + 1;
          bidx(naidx) = nb;
        end
      case 1
        nb = length(fval);
        for i = 1:nb
          bidx(af == fval(i)) = i;
        end
    end
  end

return
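% For reference, the information gain ratio computed by IGR below follows
% Quinlan:
%
%   gain(S)  = H(parent) - sum_b (|S_b|/|S|)*H(S_b)
%   split(S) = - sum_b (|S_b|/|S|)*log2(|S_b|/|S|)
%   IGR      = gain/split
%
% A small (hypothetical) MATLAB check for a binary split of 80 objects:
%
%   K = [40 40]; L = [30 10]; R = [10 30];
%   H = @(n) -sum((n/sum(n)).*log2(n/sum(n)));
%   gain  = H(K) - 0.5*H(L) - 0.5*H(R);   % about 0.19 bits
%   split = 1;                            % two equal-sized branches
%   igr   = gain/split;
%
% The implementation works with unnormalized (weighted) counts and subtracts
% an extra LOG2(number of candidate thresholds) penalty for continuous
% features, but the idea is the same.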
%IGR The information gain ratio
%
%   [CRITVAL, IDX] = IGR(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
%   [CRITVAL, IDX] = IGR(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
%
function [critval, idx] = igr(feattype, varargin)

  switch feattype
    case 0
      [MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
      M = MK + MU;
      infoold = - K * log2(K.'/MK);
      infonew = - NA * log2(NA.'/MNA);
      infosplit = - MU * log2(MU/M) - MNA * log2(MNA/M);
      if ~isempty(R)
        infolr = - ( ...
          sum(L .* log2(L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
          sum(R .* log2(R./(repmat(MR, [1 size(R, 2)]))), 2) ...
        );
        % best criterion value over all thresholds
        [infolr, idx] = min(infolr);
        infonew = infonew + infolr;
        infosplit = infosplit - [ML(idx), MR(idx)] * log2([ML(idx); MR(idx)]/M);
      else
        idx = [];
        infonew = infonew - AP * log2(AP.'/MAP);
        infosplit = infosplit - MAP * log2(MAP/M);
      end
      infothresh = log2(max(length(uv) - 1, 1));
      infogain = (infoold - infonew - infothresh);

    case 1
      idx = [];
      [MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
      M = MK + MU;
      infoold = - K * log2(K.'/MK);
      infosplit = - MU.' * log2(MU/M);
      infonew = - sum(sum(B .* log2(B./(repmat(MB, [1 size(B, 2)]))), 2), 1);
      infosplit = infosplit - MB.' * log2(MB/M);
      infogain = (infoold - infonew);
  end

  % infogain = infogain / M;
  % infosplit = infosplit / M;

  if infogain > 0
    critval = infogain / infosplit;
  else
    critval = -inf;
  end

return
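% For reference, the (cost-free) change in Gini impurity computed by GINI
% below is
%
%   delta = sum_b (|S_b|/|S|)*sum_c p_bc^2  -  sum_c p_c^2
%
% i.e. the gain in "purity". A small (hypothetical) check:
%
%   K = [40 40]; L = [30 10]; R = [10 30];
%   g = @(n) sum((n/sum(n)).^2);
%   delta = 0.5*g(L) + 0.5*g(R) - g(K);   % = 0.125
%
% The implementation keeps the counts unnormalized and requires the change to
% exceed 0.1 (or 0.1 of the smallest non-zero cost) before a split is accepted.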
%GINI The change in the Gini impurity index
%
%   [CRITVAL, IDX] = GINI(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
%   [CRITVAL, IDX] = GINI(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
%
function [critval, idx] = gini(feattype, varargin)

  switch feattype
    case 0
      [MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
      if isempty(cost)
        purityold = K * (K.'/MK);
        %impurityold = MK - purityold;
        if ~isempty(R)
          puritylr = ( ...
            sum(L .* (L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
            sum(R .* (R./(repmat(MR, [1 size(R, 2)]))), 2) ...
          );
          % best criterion value over all thresholds
          [puritylr, idx] = max(puritylr);
          puritynew = puritylr + NA * (NA.'/MNA);
        else
          idx = [];
          puritynew = AP * (AP.'/MAP) + NA * (NA.'/MNA);
        end
        %impuritynew = MK - puritynew;
        deltaimpurity = puritynew - purityold;
      else
        impurityold = K * cost * (K.'/MK);
        if ~isempty(R)
          impuritylr = ...
            sum((L*cost) .* (L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
            sum((R*cost) .* (R./(repmat(MR, [1 size(R, 2)]))), 2);
          [impuritylr, idx] = min(impuritylr);
          impuritynew = impuritylr + NA * cost * (NA.'/MNA);
        else
          idx = [];
          impuritynew = AP * cost * (AP.'/MAP) + NA * cost * (NA.'/MNA);
        end
        deltaimpurity = impurityold - impuritynew;
      end

    case 1
      idx = [];
      [MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
      if isempty(cost)
        purityold = K * (K.'/MK);
        %impurityold = MK - purityold;
        puritynew = sum(sum(B .* (B./(repmat(MB, [1 size(B, 2)]))), 2));
        %impuritynew = MK - puritynew;
        deltaimpurity = puritynew - purityold;
      else
        impurityold = K * cost * (K.'/MK);
        impuritynew = ( ...
          sum(sum((B*cost) .* (B./(repmat(MB, [1 size(B, 2)]))), 2), 1) ...
        );
        deltaimpurity = impurityold - impuritynew;
      end
  end

  if isempty(cost)
    deltamin = 0.1;
  else
    deltamin = 0.1*min(abs(cost(abs(cost) > 0)));
  end

  if deltaimpurity > deltamin
    critval = deltaimpurity/(MU + MK);
  else
    critval = -inf;
  end

return


%MISCLS Misclassification error criterion
%
%   [CRITVAL, IDX] = MISCLS(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
%   [CRITVAL, IDX] = MISCLS(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
%
function [critval, idx] = miscls(feattype, varargin)

  switch feattype
    case 0
      [MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
      if isempty(cost)
        ccold = max(K, [], 2);
        if ~isempty(R)
          ccnew = max(L, [], 2) + max(R, [], 2);
          [ccnew, idx] = max(ccnew);
          ccnew = ccnew + max(NA, [], 2);
        else
          idx = [];
          ccnew = max(AP, [], 2) + max(NA, [], 2);
        end
        deltamc = ccnew - ccold;
      else
        mcold = min(K*cost, [], 2);
        if ~isempty(R)
          mcnew = min(L*cost, [], 2) + min(R*cost, [], 2);
          [mcnew, idx] = min(mcnew);
          mcnew = mcnew + min(NA*cost, [], 2);
        else
          idx = [];
          mcnew = min(AP*cost, [], 2) + min(NA*cost, [], 2);
        end
        deltamc = mcold - mcnew;
      end

    case 1
      idx = [];
      [MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
      if isempty(cost)
        ccold = max(K, [], 2);
        ccnew = sum(max(B, [], 2), 1);
        deltamc = ccnew - ccold;
      else
        mcold = min(K*cost, [], 2);
        mcnew = sum(min(B*cost, [], 2), 1);
        deltamc = mcold - mcnew;
      end
  end

  %if isempty(cost)
  %  deltamin = min(weights(weights > 0));
  %else
  %  deltamin = min(weights(weights > 0))*min(cost(cost > 0));
  %end
  %
  %if ~isempty(deltamin)
  %  deltamin = 0.5*deltamin;
  %else
  %  deltamin = 0;
  %end
  %
  %if deltamc <= deltamin
  %  critval = -inf;
  %else
  %  critval = deltamc / (MU+MK);
  %end

  critval = deltamc / (MU + MK);

return


%PRUNEP Pessimistic pruning of a decision tree
%
%   TREE = PRUNEP(TREE, NODE)
%
% Pessimistic pruning as defined by Quinlan.
%
function tree = prunep(tree, node)

  persistent pt;

  if (nargout ~= 0) || (nargin ~= 1) || ~isscalar(tree) || ~isnumeric(tree) || ~isint(tree)
    % top-level call: store the tree and start pruning from NODE (default root)
    if nargin < 2 || isempty(node)
      node = 1;
    end
    pt = tree;
    prunep(node);
    tree = pt;
    clear pt;
  else
    % recursive call: TREE is the index of the node to consider
    node = tree;
    if pt.fidx(node) > 0
      tnidx = (node+1):(node + pt.size(node) - 1);
      nleaves = nnz(pt.fidx(tnidx) == 0);
      errt = pt.errt(node) + 0.5*nleaves;
      errl = pt.errl(node) + 0.5;
      nsmp = pt.nsmp(node);
      sd = sqrt(errt*(1 - errt/nsmp));
      if errl < (errt + sd)
        % prune: mark the whole subtree for removal and make this node a leaf
        pt.fidx(tnidx) = -1;
        pt.errt(node) = pt.errl(node);
        pt.size(node) = 1;
        pt.fidx(node) = 0;
        pt.fval(node) = {[]};
        pt.nidx(node) = {[]};
      else
        errt = 0;
        for i = 1:length(pt.nidx{node})
          idx = pt.nidx{node}(i);
          prunep(idx);
          errt = errt + pt.errt(idx);
        end
        pt.errt(node) = errt;
      end
    end
  end

return
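% The pruning decision in PRUNEP above follows Quinlan's pessimistic
% estimate: the subtree error is corrected by 0.5 per leaf, the node-as-leaf
% error by 0.5, and the subtree is replaced by a leaf when
%
%   errl + 0.5 < errt + 0.5*nleaves + sqrt(errt'*(1 - errt'/nsmp))
%
% where errt' is the corrected subtree error. A rough (hypothetical) check:
% with nsmp = 100 and errt = 8 over 4 leaves (errt' = 10, sd = 3), a node
% with errl = 10 gives 10.5 < 13 and would therefore be pruned.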
%PRUNET Prune tree by a test set
%
%   TREE = PRUNET(TREE, T, COST)
%
% The test set T is used to prune a decision tree.
%
function tree = prunet(tree, t, cost)

  persistent pt;

  if (nargout ~= 0) || (nargin ~= 1) || ~isscalar(tree) || ~isnumeric(tree) || ~isint(tree)
    % top-level call
    if nargin < 3
      cost = [];
    end
    [m, k, c] = getsize(t);
    cs = classsizes(t);
    prior = getprior(t);
    prior = prior/sum(prior);
    nlabt = getnlab(t);
    % per-object weights derived from the class priors of the test set
    weights = m*prior./cs;
    weights = (weights(nlabt)).';

    pt = tree;
    mt = size(pt.post, 1);
    [p, s] = mapdt(pt, +t, nlabt, weights);

    % error (cost) in each node as if all nodes were leaves
    if isempty(cost)
      idx = sub2ind([mt, c], (1:mt).', pt.cidx);
      pt.test_errl = sum(s, 2) - s(idx);
    else
      pt.test_errl = sum(s .* cost(:, pt.cidx).', 2);
    end
    pt.test_errt = pt.test_errl;

    prunet(1);
    tree = pt;
    clear pt;
  else
    % recursive (bottom-up) call: TREE is the index of the node to consider
    node = tree;
    if pt.fidx(node) > 0
      test_errt = 0;
      errt = 0;
      for i = 1:length(pt.nidx{node})
        idx = pt.nidx{node}(i);
        prunet(idx);
        test_errt = test_errt + pt.test_errt(idx);
        errt = errt + pt.errt(idx);
      end
      if pt.test_errl(node) <= test_errt
        % prune: the node as a leaf is at least as good on the test set
        pt.fidx(pt.nidx{node}) = -1;
        pt.errt(node) = pt.errl(node);
        pt.size(node) = 1;
        pt.fidx(node) = 0;
        pt.fval(node) = {[]};
        pt.nidx(node) = {[]};
      else
        pt.test_errt(node) = test_errt;
        pt.errt(node) = errt;
      end
    end
  end

return


%CLEANDT Remove the nodes marked for removal (FIDX == -1) and renumber
% the remaining nodes.
function tree = cleandt(tree)

  rnidx = tree.fidx == -1;
  if nnz(rnidx) == 0
    return
  end

  rncount = cumsum(rnidx);
  fn = fieldnames(tree);
  for i = 1:length(fn)
    tree.(fn{i})(rnidx, :) = [];
  end
  tree.nidx = cellfun(@(x) x - rncount(x).', tree.nidx, 'UniformOutput', false);
  idx = (1:length(rnidx)).';
  idx(rnidx) = [];
  tree.size = tree.size - (rncount(idx + tree.size - 1) - rncount(idx));

return