DecisionTree.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch>  - CERN, Switzerland           *
 *      Helge Voss        <Helge.Voss@cern.ch>      - MPI-K Heidelberg, Germany   *
 *      Kai Voss          <Kai.Voss@cern.ch>        - U. of Victoria, Canada      *
 *      Eckhard v. Toerne <evt@uni-bonn.de>         - U of Bonn, Germany          *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>    - U of Bonn, Germany          *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/license.txt)                                      *
 *                                                                                *
 **********************************************************************************/

/*! \class TMVA::DecisionTree
\ingroup TMVA

Implementation of a Decision Tree

In a decision tree successive decision nodes are used to categorize the
events out of the sample as either signal or background. Each node
uses only a single discriminating variable to decide if the event is
signal-like ("goes right") or background-like ("goes left"). This
forms a tree-like structure with "baskets" at the end (leaf nodes),
and an event is classified as either signal or background according to
whether the basket where it ends up has been classified as signal or
background during the training. Training of a decision tree is the
process of defining the "cut criteria" for each node. The training
starts with the root node. Here one takes the full training event
sample and selects the variable and corresponding cut value that give
the best separation between signal and background at this stage. Using
this cut criterion, the sample is then divided into two subsamples, a
signal-like (right) and a background-like (left) sample. Two new nodes
are then created for each of the two sub-samples and they are
constructed using the same mechanism as described for the root
node. The division is stopped once a certain node has reached either a
minimum number of events, or a minimum or maximum signal purity. These
leaf nodes are then called "signal" or "background" depending on
whether they contain more signal or background events from the
training sample.

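A minimal usage sketch (illustrative only, not code from this file; the event
sample, the DataSetInfo pointer and the variable values are assumed to be
prepared by the caller):

~~~ {.cpp}
std::vector<const TMVA::Event*> trainingEvents = ...; // prepared by the caller
TMVA::DecisionTree tree(new TMVA::GiniIndex(), // separation criterion
                        2.5,          // MinNodeSize, in % of the training sample
                        20,           // NCuts: cut grid points per variable
                        dataSetInfo,  // a prepared TMVA::DataSetInfo*
                        0);           // index of the signal class
tree.BuildTree(trainingEvents);       // grow the tree on the training sample
Double_t leafResponse = tree.CheckEvent(trainingEvents.front()); // classify one event
~~~
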
*/

#include <iostream>
#include <algorithm>
#include <vector>
#include <limits>
#include <fstream>
#include <cassert>
#include <cmath>    // std::abs on floating-point types (almost_equal_* below)
#include <numeric>  // std::accumulate (used in the MapReduce reductions below)

#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"
#include "TStopwatch.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/DecisionTreeNode.h"
#include "TMVA/BinarySearchTree.h"

#include "TMVA/Tools.h"
#include "TMVA/Config.h"

#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/Event.h"
#include "TMVA/BDTEventWrapper.h"
#include "TMVA/IPruneTool.h"

const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds

using std::vector;

ClassImp(TMVA::DecisionTree);

bool almost_equal_float(float x, float y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<double>::min();
}
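
// For illustration (not part of the original file): a plain == comparison can
// fail after accumulated rounding, while the ULP-scaled test above absorbs it:
//    double a = 0.1 + 0.2;                   // 0.30000000000000004
//    bool eq  = (a == 0.3);                  // false
//    bool ok  = almost_equal_double(a, 0.3); // true within 4 ULPs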

////////////////////////////////////////////////////////////////////////////////
/// default constructor using the GiniIndex as separation criterion,
/// no restrictions on the minimum number of events in a leaf node or on the
/// separation gain in the node splitting

TMVA::DecisionTree::DecisionTree():
   BinaryTree(),
   fNvars (0),
   fNCuts (-1),
   fUseFisherCuts (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType (NULL),
   fRegType (NULL),
   fMinSize (0),
   fMinNodeSize (1),
   fMinSepGain (0),
   fUseSearchTree(kFALSE),
   fPruneStrength(0),
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   fRandomisedTree (kFALSE),
   fUseNvars (0),
   fUsePoissonNvars(kFALSE),
   fMyTrandom (NULL),
   fMaxDepth (999999),
   fSigClass (0),
   fTreeID (0),
   fAnalysisType (Types::kClassification),
   fDataSetInfo (NULL)
{}

////////////////////////////////////////////////////////////////////////////////
/// constructor specifying the separation type, the min number of
/// events in a node that is still subjected to further splitting, and the
/// number of bins in the grid used in applying the cut for the node
/// splitting.

TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts, DataSetInfo* dataInfo, UInt_t cls,
                                  Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
                                  UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
   BinaryTree(),
   fNvars (0),
   fNCuts (nCuts),
   fUseFisherCuts (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType (sepType),
   fRegType (NULL),
   fMinSize (0),
   fMinNodeSize (minSize),
   fMinSepGain (0),
   fUseSearchTree (kFALSE),
   fPruneStrength (0),
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   fMyTrandom (new TRandom3(iSeed)),
   fMaxDepth (nMaxDepth),
   fSigClass (cls),
   fTreeID (treeID),
   fAnalysisType (Types::kClassification),
   fDataSetInfo (dataInfo)
{
   if (sepType == NULL) { // interpreted as a regression tree, where
                          // currently the separation type (simple least squares)
                          // cannot be chosen freely
      fAnalysisType = Types::kRegression;
      fRegType = new RegressionVariance();
      if ( nCuts <=0 ) {
         fNCuts = 200;
         Log() << kWARNING << " You had chosen the training mode using optimal cuts (by setting NCuts < 0)\n"
               << " rather than a grid of cut values. As this mode does not exist yet for regression\n"
               << " trees, NCuts is set to " << fNCuts << " and the grid is used instead."
               << Endl;
      }
   } else {
      fAnalysisType = Types::kClassification;
   }
}

////////////////////////////////////////////////////////////////////////////////
/// copy constructor that creates a true copy, i.e. a completely independent tree;
/// the node copy will recursively copy all the nodes

TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   BinaryTree(),
   fNvars (d.fNvars),
   fNCuts (d.fNCuts),
   fUseFisherCuts (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType (d.fSepType),
   fRegType (d.fRegType),
   fMinSize (d.fMinSize),
   fMinNodeSize(d.fMinNodeSize),
   fMinSepGain (d.fMinSepGain),
   fUseSearchTree (d.fUseSearchTree),
   fPruneStrength (d.fPruneStrength),
   fPruneMethod (d.fPruneMethod),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
   fMaxDepth (d.fMaxDepth),
   fSigClass (d.fSigClass),
   fTreeID (d.fTreeID),
   fAnalysisType(d.fAnalysisType),
   fDataSetInfo (d.fDataSetInfo)
{
   this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
   this->SetParentTreeInNodes();
   fNNodes = d.fNNodes;
}


////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DecisionTree::~DecisionTree()
{
   // destruction of the tree nodes is done in the "base class" BinaryTree

   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree and set the "parent tree" pointer in all of its nodes;
/// the maximal depth reached in the tree is filled in at the same time.

void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
{
   if (n == NULL) { //default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" << Endl;
         return;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetRightDaughter(n) );
      }
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// re-create a new tree (decision tree or search tree) from XML

TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
   std::string type("");
   gTools().ReadAttr(node,"type", type);
   DecisionTree* dt = new DecisionTree();

   dt->ReadXML( node, tmva_Version_Code );
   return dt;
}

// #### Multithreaded DecisionTree::BuildTree
#ifdef R__USE_IMT
//====================================================================================
// Had to define this struct to enable parallelization.
// Using BuildNodeInfo, each thread can return the information needed.
// After each thread returns we can then merge the results, hence the + operator.
//====================================================================================
struct BuildNodeInfo{

   BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         xmin[ivar]=val;
         xmax[ivar]=val;
      }
   };

   BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){

      nvars = fNvars;
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);

      // #### the initial min and max for each feature
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         xmin[ivar]=inxmin[ivar];
         xmax[ivar]=inxmax[ivar];
      }
   };
   BuildNodeInfo(){};

   Int_t nvars = 0;
   Double_t s = 0;
   Double_t suw = 0;
   Double_t sub = 0;
   Double_t b = 0;
   Double_t buw = 0;
   Double_t bub = 0;
   Double_t target = 0;
   Double_t target2 = 0;
   std::vector<Float_t> xmin;
   std::vector<Float_t> xmax;

   // #### Define the addition operator for BuildNodeInfo
   // #### Make sure both BuildNodeInfos have the same nvars if we add them
   BuildNodeInfo operator+(const BuildNodeInfo& other)
   {
      BuildNodeInfo ret(nvars, xmin, xmax);
      if(nvars != other.nvars)
      {
         std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
         return ret;
      }
      ret.s = s + other.s;
      ret.suw = suw + other.suw;
      ret.sub = sub + other.sub;
      ret.b = b + other.b;
      ret.buw = buw + other.buw;
      ret.bub = bub + other.bub;
      ret.target = target + other.target;
      ret.target2 = target2 + other.target2;

      // xmin is the min of both, xmax is the max of both
      for(Int_t i=0; i<nvars; i++)
      {
         ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
         ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
      }
      return ret;
   };

};
//===========================================================================
// Done with BuildNodeInfo declaration
//===========================================================================
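
// For illustration (hypothetical values, not from this file): two partial
// results from different threads merge into one via the + operator above, e.g.
//    BuildNodeInfo total = partA + partB; // sums s/b counts, widens [xmin,xmax]
// which is exactly what std::accumulate does in the MapReduce reduction below.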

////////////////////////////////////////////////////////////////////////////////
/// building the decision tree by recursively calling the splitting of
/// one (root-) node into two daughter nodes (returns the number of nodes)

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<<"% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression

   // #### Set up prerequisite info for multithreading
   UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
   auto seeds = ROOT::TSeqU(nPartitions);

   // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
   auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){

      Int_t start = 1.0*partition/nPartitions*eventSample.size();
      Int_t end   = (partition+1.0)/nPartitions*eventSample.size();

      BuildNodeInfo nodeInfof(fNvars, eventSample[0]);

      for(Int_t iev=start; iev<end; iev++){
         const TMVA::Event* evt = eventSample[iev];
         const Double_t weight = evt->GetWeight();
         const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
         if (evt->GetClass() == fSigClass) {
            nodeInfof.s += weight;
            nodeInfof.suw += 1;
            nodeInfof.sub += orgWeight;
         }
         else {
            nodeInfof.b += weight;
            nodeInfof.buw += 1;
            nodeInfof.bub += orgWeight;
         }
         if ( DoRegression() ) {
            const Double_t tgt = evt->GetTarget(0);
            nodeInfof.target +=weight*tgt;
            nodeInfof.target2+=weight*tgt*tgt;
         }

         // save the min and max for each feature
         for (UInt_t ivar=0; ivar<fNvars; ivar++) {
            const Double_t val = evt->GetValueFast(ivar);
            if (iev==start){
               nodeInfof.xmin[ivar]=val;
               nodeInfof.xmax[ivar]=val;
            }
            if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
            if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
         }
      }
      return nodeInfof;
   };

   // #### Need an initial struct to pass to std::accumulate
   BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);

   // #### Run the threads in parallel then merge the results
   auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
   BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
   //NodeInfo nodeInfo(fNvars);

   if (nodeInfo.s+nodeInfo.b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help .. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(nodeInfo.s);
   node->SetNBkgEvents(nodeInfo.b);
   node->SetNSigEvents_unweighted(nodeInfo.suw);
   node->SetNBkgEvents_unweighted(nodeInfo.buw);
   node->SetNSigEvents_unboosted(nodeInfo.sub);
   node->SetNBkgEvents_unboosted(nodeInfo.bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(nodeInfo.s+nodeInfo.b);
      node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
      node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
      node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
   }

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight .. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {

      // Train the node and figure out the separation gain and split points
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      }
      else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }

      // The node has been trained and there is nothing to be gained by splitting
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
            node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
            if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
      }
      else {
         // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
         // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }
         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
         node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
         if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight;
         // those "cannot" be boosted normally. Hence, if one of those is misclassified,
         // find randomly as many events with positive weights in this node as needed
         // to get the same absolute amount of weight, and mark them as "not to be boosted"
         // in order to make up for not boosting the negative-weight event
      }

      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

// Standard DecisionTree::BuildTree (multithreading is not enabled)
#else

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<<"% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   Double_t s=0, b=0;
   Double_t suw=0, buw=0;
   Double_t sub=0, bub=0; // unboosted!
   Double_t target=0, target2=0;
   Float_t *xmin = new Float_t[fNvars];
   Float_t *xmax = new Float_t[fNvars];

   // initializing xmin and xmax for each variable
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   // sum up the totals
   // sig and bkg for classification
   // err and err2 for regression
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      const TMVA::Event* evt = eventSample[iev];
      const Double_t weight = evt->GetWeight();
      const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
      if (evt->GetClass() == fSigClass) {
         s += weight;
         suw += 1;
         sub += orgWeight;
      }
      else {
         b += weight;
         buw += 1;
         bub += orgWeight;
      }
      if ( DoRegression() ) {
         const Double_t tgt = evt->GetTarget(0);
         target +=weight*tgt;
         target2+=weight*tgt*tgt;
      }

      // save the min and max for each feature
      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }

   if (s+b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<s<<" Nbkg="<<b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help .. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(s);
   node->SetNBkgEvents(b);
   node->SetNSigEvents_unweighted(suw);
   node->SetNBkgEvents_unweighted(buw);
   node->SetNSigEvents_unboosted(sub);
   node->SetNBkgEvents_unboosted(bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(s+b);
      node->SetNEvents_unweighted(suw+buw);
      node->SetNEvents_unboosted(sub+bub);
   }

   // save the min and max for each feature
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,xmin[ivar]);
      node->SetSampleMax(ivar,xmax[ivar]);
   }
   delete[] xmin;
   delete[] xmax;

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight .. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      } else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
            node->SetResponse(target/(s+b));
            if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));

            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());

      } else {

         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }

         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
         node->SetResponse(target/(s+b));
         if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight;
         // those "cannot" be boosted normally. Hence, if one of those is misclassified,
         // find randomly as many events with positive weights in this node as needed
         // to get the same absolute amount of weight, and mark them as "not to be boosted"
         // in order to make up for not boosting the negative-weight event
      }

      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

#endif

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling the events
/// in at the top node and seeing where they happen to end up

void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
{
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling the event
/// in at the top node and seeing where it happens to end up

void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
                                    TMVA::DecisionTreeNode *node )
{
   if (node == NULL) { // that's the start, take the Root node
      node = this->GetRoot();
   }

   node->IncrementNEvents( event.GetWeight() );
   node->IncrementNEvents_unweighted( );

   if (event.GetClass() == fSigClass) {
      node->IncrementNSigEvents( event.GetWeight() );
      node->IncrementNSigEvents_unweighted( );
   }
   else {
      node->IncrementNBkgEvents( event.GetWeight() );
      node->IncrementNBkgEvents_unweighted( );
   }
   node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
                                                         node->GetNBkgEvents()));

   if (node->GetNodeType() == 0) { //intermediate node --> go down
      if (node->GoesRight(event))
         this->FillEvent(event, node->GetRight());
      else
         this->FillEvent(event, node->GetLeft());
   }
}

////////////////////////////////////////////////////////////////////////////////
/// clear the tree nodes (their S/N, Nevents etc.), just keep the structure of the tree

void TMVA::DecisionTree::ClearTree()
{
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
}

////////////////////////////////////////////////////////////////////////////////
/// remove those last splits that result in two leaf nodes that
/// are both of the same type (i.e. both signal or both background);
/// this of course is only a reasonable thing to do when you use
/// "YesOrNo" leaves, while it might lose something if you use the
/// purity information in the nodes.
/// --> hence I don't call it automatically in the tree building

UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
{
   if (node==NULL) {
      node = this->GetRoot();
   }

   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   if (node->GetNodeType() == 0) {
      this->CleanTree(l);
      this->CleanTree(r);
      if (l->GetNodeType() * r->GetNodeType() > 0) {

         this->PruneNode(node);
      }
   }
   // update the number of nodes after the cleaning
   return this->CountNodes();
}

////////////////////////////////////////////////////////////////////////////////
/// prune (get rid of internal nodes) the decision tree to avoid overtraining;
/// several different pruning methods can be applied as selected by the
/// variable "fPruneMethod".
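///
/// A typical call sequence might look as follows (a sketch, not code from this
/// file; it assumes the usual setters and a validation sample prepared by the
/// caller):
/// ~~~ {.cpp}
/// tree.SetPruneMethod(TMVA::DecisionTree::kCostComplexityPruning);
/// tree.SetPruneStrength(-1);          // negative value: determine the strength automatically
/// tree.PruneTree(&validationSample);  // EventConstList of independent events
/// ~~~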

Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
{
   IPruneTool* tool(NULL);
   PruningInfo* info(NULL);

   if( fPruneMethod == kNoPruning ) return 0.0;

   if (fPruneMethod == kExpectedErrorPruning)
      // tool = new ExpectedErrorPruneTool(logfile);
      tool = new ExpectedErrorPruneTool();
   else if (fPruneMethod == kCostComplexityPruning)
   {
      tool = new CostComplexityPruneTool();
   }
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented "
            << Endl;
   }

   if(!tool) return 0.0;

   tool->SetPruneStrength(GetPruneStrength());
   if(tool->IsAutomatic()) {
      if(validationSample == NULL){
         Log() << kFATAL << "Cannot automate the pruning algorithm without an "
               << "independent validation sample!" << Endl;
      }else if(validationSample->size() == 0) {
         Log() << kFATAL << "Cannot automate the pruning algorithm with "
               << "independent validation sample of ZERO events!" << Endl;
      }
   }

   info = tool->CalculatePruningInfo(this,validationSample);
   Double_t pruneStrength=0;
   if(!info) {
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
            << Endl;
   } else {
      pruneStrength = info->PruneStrength;

      // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
      //       << " has quality index " << info->QualityIndex << Endl;

      for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {

         PruneNode(info->PruneSequence[i]);
      }
      // update the number of nodes after the pruning
      this->CountNodes();
   }

   delete tool;
   delete info;

   return pruneStrength;
}

////////////////////////////////////////////////////////////////////////////////
/// run the validation sample through the (pruned) tree and fill in the nodes
/// the variables NSValidation and NBValidation (i.e. how many of the signal
/// and background events from the validation sample end up in each node).
/// This is then later used when asking for the "tree quality" ..

void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
{
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// return the misclassification rate of a pruned tree;
/// a "pruned tree" may have the variable "IsTerminal" set to "arbitrary" at
/// any node, so this tree-quality testing will stop there, and hence it tests
/// the pruned tree (while the full tree is still in place for normal/later use)

Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" << Endl;
         return 0;
      }
   }

   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   else { // terminal leaf (in a pruned subtree of T_max at least)
      if (DoRegression()) {
         Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
         return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
      }
      else {
         if (mode == 0) {
            if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
               return n->GetNBValidation();
            else
               return n->GetNSValidation();
         }
         else if ( mode == 1 ) {
            // calculate the weighted error using the pruning validation sample
            return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
         }
         else {
            throw std::string("Unknown ValidationQualityMode");
         }
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// pass a single validation event through a pruned decision tree;
/// on the way down the tree, fill in all the "intermediate" information
/// that would normally be there from training.

void TMVA::DecisionTree::CheckEventWithPrunedTree( const Event* e ) const
{
   DecisionTreeNode* current = this->GetRoot();
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" << Endl;
   }

   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
      else
         current->SetNBValidation(current->GetNBValidation() + e->GetWeight());

      if (e->GetNTargets() > 0) {
         current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
         current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
      }

      if (current->GetRight() == NULL || current->GetLeft() == NULL) {
         current = NULL;
      }
      else {
         if (current->GoesRight(*e))
            current = (TMVA::DecisionTreeNode*)current->GetRight();
         else
            current = (TMVA::DecisionTreeNode*)current->GetLeft();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// calculate the normalization factor for a pruning validation sample

Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
{
   Double_t sumWeights = 0.0;
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
   return sumWeights;
}

////////////////////////////////////////////////////////////////////////////////
/// return the number of terminal nodes in the sub-tree below Node n

UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" << Endl;
         return 0;
      }
   }

   UInt_t countLeafs=0;

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      countLeafs += 1;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
      }
   }
   return countLeafs;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes

void TMVA::DecisionTree::DescendTree( Node* n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "DescendTree: started with undefined ROOT node" << Endl;
         return;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // do nothing
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// prune away the subtree below the node

void TMVA::DecisionTree::PruneNode( DecisionTreeNode* node )
{
   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   node->SetRight(NULL);
   node->SetLeft(NULL);
   node->SetSelector(-1);
   node->SetSeparationGain(-1);
   if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
   else node->SetNodeType(-1);
   this->DeleteNode(l);
   this->DeleteNode(r);
   // update the stored number of nodes in the Tree
   this->CountNodes();
}

////////////////////////////////////////////////////////////////////////////////
/// prune a node temporarily (without actually deleting its descendants),
/// which allows testing the pruned tree quality for many different
/// pruning stages without "touching" the tree.

void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
   if(node == NULL) return;
   node->SetNTerminal(1);
   node->SetSubTreeR( node->GetNodeR() );
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
   node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
}

////////////////////////////////////////////////////////////////////////////////
/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
/// is coded as a sequence of left-right moves starting from the root, coded as
/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right).
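/// For example (illustrative values): sequence = 6 (binary 110) with depth = 3
/// descends left, right, right from the root, since bit i of the sequence
/// selects the move taken at depth i.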

TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
{
   Node* current = this->GetRoot();

   for (UInt_t i =0; i < depth; i++) {
      ULong_t tmp = 1 << i;
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }

   return current;
}

////////////////////////////////////////////////////////////////////////////////
/// randomly select a subset of the variables to be considered for the split
/// of this node (used for randomised trees)

void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
      // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
      fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
   }
   if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t (bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
}

// Multithreaded version of DecisionTree::TrainNodeFast
#ifdef R__USE_IMT
//====================================================================================
// Had to define this struct to enable parallelization in TrainNodeFast.
// Using TrainNodeInfo, each thread can return the information needed.
// After each thread returns we can then merge the results, hence the + operator.
//====================================================================================
struct TrainNodeInfo{

   TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){

      cNvars = cNvars_;
      nBins = nBins_;

      nSelS = std::vector< std::vector<Double_t> >(cNvars);
      nSelB = std::vector< std::vector<Double_t> >(cNvars);
      nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      target = std::vector< std::vector<Double_t> >(cNvars);
      target2 = std::vector< std::vector<Double_t> >(cNvars);

      for(Int_t ivar=0; ivar<cNvars; ivar++){
         nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
      }
   };

   TrainNodeInfo(){};

   // #### malloc problem if I define this and try to destruct xmin and xmax...
   // #### Maybe someone better at C++ can figure out why and fix this if it's a
   // #### serious memory problem
   //~TrainNodeInfo(){
   //   delete [] xmin;
   //   delete [] xmax;
   //};

   Int_t cNvars = 0;
   UInt_t* nBins;

   Double_t nTotS = 0;
   Double_t nTotS_unWeighted = 0;
   Double_t nTotB = 0;
   Double_t nTotB_unWeighted = 0;

   std::vector< std::vector<Double_t> > nSelS;
   std::vector< std::vector<Double_t> > nSelB;
   std::vector< std::vector<Double_t> > nSelS_unWeighted;
   std::vector< std::vector<Double_t> > nSelB_unWeighted;
   std::vector< std::vector<Double_t> > target;
   std::vector< std::vector<Double_t> > target2;

   // Define the addition operator for TrainNodeInfo
   // Make sure both TrainNodeInfos have the same nvars if we add them
   TrainNodeInfo operator+(const TrainNodeInfo& other)
   {
      TrainNodeInfo ret(cNvars, nBins);

      // check that the two are compatible to add
      if(cNvars != other.cNvars)
      {
         std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
         return ret;
      }

      // add the signal, background, and target sums
      for (Int_t ivar=0; ivar<cNvars; ivar++) {
         for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
            ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
            ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
            ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
            ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
            ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
            ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
         }
      }

      ret.nTotS = nTotS + other.nTotS;
      ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
      ret.nTotB = nTotB + other.nTotB;
      ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;

      return ret;
   };

};
//===========================================================================
// Done with TrainNodeInfo declaration
//===========================================================================

////////////////////////////////////////////////////////////////////////////////
/// Decide how to split a node using one of the variables that gives
/// the best separation of signal/background. In order to do this, for each
/// variable a scan of the different cut values in a grid (grid = fNCuts) is
/// performed and the resulting separation gains are compared.
/// In addition to the individual variables, one can also ask for a Fisher
/// discriminant being built out of (some of) the variables and used as a
/// possible multivariate split.

Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
                                            TMVA::DecisionTreeNode *node )
{
   // #### OK let's comment this one to see how to parallelize it
   Double_t separationGainTotal = -1;
   Double_t *separationGain = new Double_t[fNvars+1];
   Int_t *cutIndex = new Int_t[fNvars+1];  //-1;

   // #### initialize the sep gain and cut index values
   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      cutIndex[ivar]=-1;
   }
   // #### set up some other variables
   Int_t mxVar = -1;
   Bool_t cutType = kTRUE;
   UInt_t nevents = eventSample.size();

   // the +1 comes from the fact that I treat later on the Fisher output as an
   // additional possible variable.
   Bool_t *useVariable = new Bool_t[fNvars+1];   // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
   UInt_t *mapVariable = new UInt_t[fNvars+1];   // map the subset of variables used in randomised trees to the original variable number (used in the Event() )

   std::vector<Double_t> fisherCoeff;

   // #### set up a map to the subset of variables using two arrays
   if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
      UInt_t tmp=fUseNvars;
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   else {
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVariable[ivar] = kTRUE;
         mapVariable[ivar] = ivar;
      }
   }
   // #### last variable entry is the fisher variable
   useVariable[fNvars] = kFALSE; //by default fisher is not used..

1417  // #### Begin Fisher calculation
1418  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1419  if (fUseFisherCuts) {
1420  useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1421 
1422  //use for the Fisher discriminant ONLY those variables that show
1423  //some reasonable linear correlation in either Signal or Background
1424  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1425  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1426  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1427  useVarInFisher[ivar] = kFALSE;
1428  mapVarInFisher[ivar] = ivar;
1429  }
1430 
1431  std::vector<TMatrixDSym*>* covMatrices;
1432  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1433  if (!covMatrices){
1434  Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> revert to just normal cuts for this node" << Endl;
1435  fisherOK = kFALSE;
1436  }else{
1437  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1438  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1439  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1440  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1441 
1442  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1443  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1444  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1445  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1446  useVarInFisher[ivar] = kTRUE;
1447  useVarInFisher[jvar] = kTRUE;
1448  }
1449  }
1450  }
1451  // now that you know which variables you want to use, count and map them
1452  // such that you can use an array/matrix filled only with THOSE variables
1453  // that are actually used
1454  UInt_t nFisherVars = 0;
1455  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1456  // now pick those variables that are used in the Fisher and are also
1457  // part of the "allowed" variables (in case of randomised trees)
1458  if (useVarInFisher[ivar] && useVariable[ivar]) {
1459  mapVarInFisher[nFisherVars++]=ivar;
1460  // now exclude the variables used in the Fisher cuts, and don't
1461  // use them anymore in the individual variable scan
1462  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1463  }
1464  }
1465 
1466 
1467  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1468  fisherOK = kTRUE;
1469  }
1470  delete [] useVarInFisher;
1471  delete [] mapVarInFisher;
1472 
1473  }
1474  // #### End Fisher calculation
1475 
1476 
1477  UInt_t cNvars = fNvars;
1478  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1479 
1480  // #### set up the binning info arrays
1481  // #### each var has its own binning since some may be integers
1482  UInt_t* nBins = new UInt_t [cNvars];
1483  Double_t* binWidth = new Double_t [cNvars];
1484  Double_t* invBinWidth = new Double_t [cNvars];
1485  Double_t** cutValues = new Double_t* [cNvars];
1486 
1487  // #### set up the xmin and xmax arrays
1488  // #### each var has its own range
1489  Double_t *xmin = new Double_t[cNvars];
1490  Double_t *xmax = new Double_t[cNvars];
1491 
1492  // construct and initialize binning/cuts
1493  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1494  // ncuts means that we need n+1 bins for each variable
1495  nBins[ivar] = fNCuts+1;
1496  if (ivar < fNvars) {
1497  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1498  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1499  }
1500  }
1501 
1502  cutValues[ivar] = new Double_t [nBins[ivar]];
1503  }
1504 
1505  // init the range and cutvalues for each var now that we know the binning
1506  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1507  if (ivar < fNvars){
1508  xmin[ivar]=node->GetSampleMin(ivar);
1509  xmax[ivar]=node->GetSampleMax(ivar);
1510  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1511  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1512  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1513  useVariable[ivar]=kFALSE;
1514  }
1515 
1516  } else { // the fisher variable
1517  xmin[ivar]=999;
1518  xmax[ivar]=-999;
1519  // too bad, for the moment I don't know how to do this without looping
1520  // once to get the "min max" and then AGAIN to fill the histogram
1521  for (UInt_t iev=0; iev<nevents; iev++) {
1522  // returns the Fisher value (no fixed range)
1523  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1524  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1525  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1526  if (result > xmax[ivar]) xmax[ivar]=result;
1527  if (result < xmin[ivar]) xmin[ivar]=result;
1528  }
1529  }
1530  // this loop could take a long time if nbins is large
1531  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1532  cutValues[ivar][ibin]=0;
1533  }
1534  }
1535 
1536  // ====================================================================
1537  // ====================================================================
1538  // Parallelized Version
1539  // ====================================================================
1540  // ====================================================================
1541 
1542  // #### Figure out the cut values, loops through vars then through cuts
1543  // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1544  // fill the cut values for the scan:
1545  auto varSeeds = ROOT::TSeqU(cNvars);
1546  auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1547 
1548  if ( useVariable[ivar] ) {
1549 
1550  //set the grid for the cut scan on the variables like this:
1551  //
1552  // | | | | | ... | |
1553  // xmin xmax
1554  //
1555  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1556  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1557  // --> nBins = fNCuts+1
1558  // (NOTE, the cuts at xmin or xmax would just give the whole sample and
1559  // hence can be safely omitted)
1560 
1561  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1562  invBinWidth[ivar] = 1./binWidth[ivar];
1563  if (ivar < fNvars) {
1564  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1565  }
1566 
1567  // std::cout << "ivar="<<ivar
1568  // <<" min="<<xmin[ivar]
1569  // << " max="<<xmax[ivar]
1570  // << " width=" << istepSize
1571  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1572  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1573  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1574  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1575  }
1576  }
1577  return 0;
1578  };
1579  TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
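// Illustrative sketch (not part of the TMVA source): the cut grid built by
// fvarInitCuts above, for a single variable, written as a self-contained
// lambda. With fNCuts = 4 on the range (0,10): nBins = 5, binWidth = 2 and
// the scanned cuts sit at 2, 4, 6, 8 -- the edges at xmin and xmax are
// omitted since they would not split the sample. "makeCutGrid" is a
// hypothetical name, not a TMVA function.
auto makeCutGrid = [](Double_t x0, Double_t x1, UInt_t nCuts) {
   UInt_t nb = nCuts + 1; // --> nBins = fNCuts+1
   Double_t bw = (x1 - x0) / Double_t(nb);
   std::vector<Double_t> cuts(nCuts);
   for (UInt_t icut = 0; icut < nCuts; ++icut)
      cuts[icut] = x0 + Double_t(icut + 1) * bw;
   return cuts; // e.g. {2,4,6,8} for (0,10,4)
};
(void) makeCutGrid; // sketch only, not called in this function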
1580 
1581  // #### Loop through the events to get the total sig and background
1582  // #### Then loop through the vars to get the counts in each bin in each var
1583  // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts: the bin index is computed directly
1584 
1585  TrainNodeInfo nodeInfo(cNvars, nBins);
1586  UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
1587 
1588  // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1589  // #### Parallelize by chunking the data into the same number of sections as we have processors
1590  if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1591  {
1592  auto seeds = ROOT::TSeqU(nPartitions);
1593 
1594  // need a lambda function to pass to TThreadExecutor::MapReduce
1595  auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1596  &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1597 
1598  UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1599  UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1600 
1601  TrainNodeInfo nodeInfof(cNvars, nBins);
1602 
1603  for(UInt_t iev=start; iev<end; iev++) {
1604 
1605  Double_t eventWeight = eventSample[iev]->GetWeight();
1606  if (eventSample[iev]->GetClass() == fSigClass) {
1607  nodeInfof.nTotS+=eventWeight;
1608  nodeInfof.nTotS_unWeighted++; }
1609  else {
1610  nodeInfof.nTotB+=eventWeight;
1611  nodeInfof.nTotB_unWeighted++;
1612  }
1613 
1614  // #### Count the number in each bin
1615  Int_t iBin=-1;
1616  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1617  // now scan through the cuts for each variable and find which one gives
1618  // the best separationGain at the current stage.
1619  if ( useVariable[ivar] ) {
1620  Double_t eventData;
1621  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1622  else { // the fisher variable
1623  eventData = fisherCoeff[fNvars];
1624  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1625  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1626 
1627  }
1628  // #### figure out which bin it belongs in ...
1629  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!)
1630  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1631  if (eventSample[iev]->GetClass() == fSigClass) {
1632  nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1633  nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1634  }
1635  else {
1636  nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1637  nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1638  }
1639  if (DoRegression()) {
1640  nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1641  nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1642  }
1643  }
1644  }
1645  }
1646  return nodeInfof;
1647  };
1648 
1649  // #### Need an initial struct to pass to std::accumulate
1650  TrainNodeInfo nodeInfoInit(cNvars, nBins);
1651 
1652  // #### Run the threads in parallel then merge the results
1653  auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1654  nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
1655  }
1656 
1657 
1658  // #### When nbins is comparable to the size of the data, this version of parallelization is faster
1659  // #### Parallelize by vectorizing the variable loop
1660  else {
1661 
1662  auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1663 
1664  for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1665 
1666  Int_t iBin=-1;
1667  Double_t eventWeight = eventSample[iev]->GetWeight();
1668 
1669  // Only count the net signal and background once
1670  if(ivar==0){
1671  if (eventSample[iev]->GetClass() == fSigClass) {
1672  nodeInfo.nTotS+=eventWeight;
1673  nodeInfo.nTotS_unWeighted++; }
1674  else {
1675  nodeInfo.nTotB+=eventWeight;
1676  nodeInfo.nTotB_unWeighted++;
1677  }
1678  }
1679 
1680  // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1681  if ( useVariable[ivar] ) {
1682  Double_t eventData;
1683  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1684  else { // the fisher variable
1685  eventData = fisherCoeff[fNvars];
1686  for (UInt_t jvar=0; jvar<fNvars; jvar++)
1687  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1688 
1689  }
1690  // #### figure out which bin it belongs in ...
1691  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1692  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1693  if (eventSample[iev]->GetClass() == fSigClass) {
1694  nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1695  nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1696  }
1697  else {
1698  nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1699  nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1700  }
1701  if (DoRegression()) {
1702  nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1703  nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1704  }
1705  }
1706  }
1707  return 0;
1708  };
1709 
1710  TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1711  }
1712 
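// Illustrative sketch (not part of the TMVA source): the chunking used by
// the MapReduce branch above. Each of nParts workers processes the half-open
// event range [start, end); the arithmetic matches the start/end lines in
// the lambda f and covers every event exactly once even when the sizes do
// not divide evenly, e.g. 10 events over 4 partitions give
// [0,2) [2,5) [5,7) [7,10). "partitionRange" is a hypothetical name.
auto partitionRange = [](UInt_t nEvents, UInt_t nParts, UInt_t p,
                         UInt_t& start, UInt_t& end) {
   start = 1.0 * p / nParts * nEvents;   // truncation gives the floor
   end   = (p + 1.0) / nParts * nEvents;
};
(void) partitionRange; // sketch only, not called in this function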
1713  // now turn each "histogram" into a cumulative distribution
1714  // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1715  // #### doesn't hurt otherwise, pretty unnoticeable
1716  auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1717  if (useVariable[ivar]) {
1718  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1719  nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1720  nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1721  nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1722  nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1723  if (DoRegression()) {
1724  nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1725  nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1726  }
1727  }
1728  if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1729  Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1730  << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1731  << " while eventsample size = " << eventSample.size()
1732  << Endl;
1733  }
1734  double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1735  double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1736  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1737  Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1738  << lastBins
1739  << " while total number of events = " << totalSum
1740  << Endl;
1741  }
1742  }
1743  return 0;
1744  };
1745  TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
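// Illustrative sketch (not part of the TMVA source): the in-place prefix sum
// performed by fvarCumulative above, on a single counts vector. After the
// conversion, entry i holds the count of all events up to and including bin
// i, so a cut after bin i sends counts[i] events to one daughter and
// total - counts[i] to the other, e.g. {3,1,4,2} -> {3,4,8,10}.
auto toCumulative = [](std::vector<Double_t>& counts) {
   for (std::size_t ibin = 1; ibin < counts.size(); ++ibin)
      counts[ibin] += counts[ibin - 1];
};
(void) toCumulative; // sketch only, not called in this function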
1746 
1747  // #### Again, if nbins is on the order of the training data size, or within an order of magnitude or so, then this is worth parallelizing
1748  // now select the optimal cuts for each variable and find which one gives
1749  // the best separationGain at the current stage
1750  auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1751  if (useVariable[ivar]) {
1752  Double_t sepTmp;
1753  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1754  // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
1755  // calculated by the "SamplePurities" from the branches that would go to the
1756  // left or the right from this node if "these" cuts were used in the Node:
1757  // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1758  // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1759 
1760  // only allow splits where both daughter nodes match the specified minimum number
1761  // for this use the "unweighted" events, as you are interested in statistically
1762  // significant splits, which is determined by the actual number of entries
1763  // for a node, rather than the sum of event weights.
1764 
1765  Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1766  Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1767  Double_t s = nodeInfo.nTotS_unWeighted;
1768  Double_t b = nodeInfo.nTotB_unWeighted;
1769  Double_t slW = nodeInfo.nSelS[ivar][iBin];
1770  Double_t blW = nodeInfo.nSelB[ivar][iBin];
1771  Double_t sW = nodeInfo.nTotS;
1772  Double_t bW = nodeInfo.nTotB;
1773  Double_t sr = s-sl;
1774  Double_t br = b-bl;
1775  Double_t srW = sW-slW;
1776  Double_t brW = bW-blW;
1777  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1778  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1779  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1780  ) {
1781 
1782  if (DoRegression()) {
1783  sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1784  nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1785  nodeInfo.nTotS+nodeInfo.nTotB,
1786  nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1787  } else {
1788  sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1789  }
1790  if (separationGain[ivar] < sepTmp) {
1791  separationGain[ivar] = sepTmp;
1792  cutIndex[ivar] = iBin;
1793  }
1794  }
1795  }
1796  }
1797  return 0;
1798  };
1799  TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
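// Illustrative sketch (not part of the TMVA source): a standard two-class
// Gini separation gain, as a stand-in for fSepType->GetSeparationGain used
// above (TMVA's separation classes may differ in normalization). sl/bl are
// the signal/background weights going to one daughter, s/b the totals at the
// node; the gain is the parent index minus the weighted daughter indices.
auto giniIndex = [](Double_t sig, Double_t bkg) {
   Double_t n = sig + bkg;
   return n > 0 ? 2.0 * sig * bkg / (n * n) : 0.0; // 2p(1-p) with p = sig/n
};
auto giniGain = [&giniIndex](Double_t sl, Double_t bl, Double_t s, Double_t b) {
   Double_t n = s + b, nl = sl + bl, nr = n - nl;
   return giniIndex(s, b) - nl / n * giniIndex(sl, bl)
                          - nr / n * giniIndex(s - sl, b - bl);
};
(void) giniGain; // sketch only, not called in this function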
1800 
1801  // you found the best separation cut for each variable, now compare the variables
1802  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1803  if (useVariable[ivar] ) {
1804  if (separationGainTotal < separationGain[ivar]) {
1805  separationGainTotal = separationGain[ivar];
1806  mxVar = ivar;
1807  }
1808  }
1809  }
1810 
1811  if (mxVar >= 0) {
1812  if (DoRegression()) {
1813  node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1814  node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1815  if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1816  node->SetRMS(0);
1817 
1818  }else{
1819  node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1820  }
1821  }
1822  else {
1823  node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1824  if (mxVar >=0){
1825  if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1826  else cutType=kFALSE;
1827  }
1828  }
1829  node->SetSelector((UInt_t)mxVar);
1830  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1831  node->SetCutType(cutType);
1832  node->SetSeparationGain(separationGainTotal);
1833  if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1834  node->SetNFisherCoeff(0);
1835  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1836  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1837  }else{
1838  // allocate Fisher coefficients (use fNvars+1 and set the unused ones to zero; this might
1839  // even take less storage space on average than additionally storing the mapping).
1840  // Can always be changed relatively easily.
1841  node->SetNFisherCoeff(fNvars+1);
1842  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1843  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
1844  // take the Fisher-coefficient-weighted estimate as variable importance (don't fill the offset coefficient though :)
1845  if (ivar<fNvars){
1846  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1847  }
1848  }
1849  }
1850  }
1851  else {
1852  separationGainTotal = 0;
1853  }
1854 
1855  // #### These arrays are now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct them there.
1856  // #### So, I changed them from dynamic arrays to std::vector to fix this memory problem,
1857  // #### so there is no need to destruct them anymore. I didn't see any performance drop as a result.
1858  for (UInt_t i=0; i<cNvars; i++) {
1859  // delete [] nodeInfo.nSelS[i];
1860  // delete [] nodeInfo.nSelB[i];
1861  // delete [] nodeInfo.nSelS_unWeighted[i];
1862  // delete [] nodeInfo.nSelB_unWeighted[i];
1863  // delete [] nodeInfo.target[i];
1864  // delete [] nodeInfo.target2[i];
1865  delete [] cutValues[i];
1866  }
1867  //delete [] nodeInfo.nSelS;
1868  //delete [] nodeInfo.nSelB;
1869  //delete [] nodeInfo.nSelS_unWeighted;
1870  //delete [] nodeInfo.nSelB_unWeighted;
1871  //delete [] nodeInfo.target;
1872  //delete [] nodeInfo.target2;
1873 
1874  // #### left these as dynamic arrays as they were before
1875  // #### since I didn't need to mess with them for parallelization
1876  delete [] cutValues;
1877 
1878  delete [] xmin;
1879  delete [] xmax;
1880 
1881  delete [] useVariable;
1882  delete [] mapVariable;
1883 
1884  delete [] separationGain;
1885  delete [] cutIndex;
1886 
1887  delete [] nBins;
1888  delete [] binWidth;
1889  delete [] invBinWidth;
1890 
1891  return separationGainTotal;
1892 }
1893 
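// Illustrative sketch (not part of the TMVA source): the regression response
// and RMS set on the node above. From the weighted sums T = sum(w*t),
// T2 = sum(w*t*t) and W = sum(w), the response is the mean T/W and the RMS
// follows from E[t^2] - E[t]^2; the almost_equal_double guard above protects
// the square root against a tiny negative difference from rounding.
// "nodeRMSFromSums" is a hypothetical helper.
inline Double_t nodeRMSFromSums(Double_t sumT, Double_t sumT2, Double_t sumW)
{
   Double_t mean = sumT / sumW;
   Double_t var  = sumT2 / sumW - mean * mean; // E[t^2] - E[t]^2
   return var > 0 ? TMath::Sqrt(var) : 0;
}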
1894 // Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1895 #else
1896 Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1897  TMVA::DecisionTreeNode *node )
1898 {
1899 // #### OK let's comment this one to see how to parallelize it
1900  Double_t separationGainTotal = -1, sepTmp;
1901  Double_t *separationGain = new Double_t[fNvars+1];
1902  Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1903 
1904  // #### initialize the sep gain and cut index values
1905  for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1906  separationGain[ivar]=-1;
1907  cutIndex[ivar]=-1;
1908  }
1909  // ### set up some other variables
1910  Int_t mxVar = -1;
1911  Bool_t cutType = kTRUE;
1912  Double_t nTotS, nTotB;
1913  Int_t nTotS_unWeighted, nTotB_unWeighted;
1914  UInt_t nevents = eventSample.size();
1915 
1916 
1917  // the +1 comes from the fact that I treat later on the Fisher output as an
1918  // additional possible variable.
1919  Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1920  UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1921 
1922  std::vector<Double_t> fisherCoeff;
1923 
1924  // #### set up a map to the subset of variables using two arrays
1925  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1926  UInt_t tmp=fUseNvars;
1927  GetRandomisedVariables(useVariable,mapVariable,tmp);
1928  }
1929  else {
1930  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1931  useVariable[ivar] = kTRUE;
1932  mapVariable[ivar] = ivar;
1933  }
1934  }
1935  // #### last variable entry is the fisher variable
1936  useVariable[fNvars] = kFALSE; //by default fisher is not used..
1937 
1938  // #### Begin Fisher calculation
1939  Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1940  if (fUseFisherCuts) {
1941  useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1942 
1943  //use for the Fisher discriminant ONLY those variables that show
1944  //some reasonable linear correlation in either Signal or Background
1945  Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1946  UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1947  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1948  useVarInFisher[ivar] = kFALSE;
1949  mapVarInFisher[ivar] = ivar;
1950  }
1951 
1952  std::vector<TMatrixDSym*>* covMatrices;
1953  covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1954  if (!covMatrices){
1955  Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> revert to just normal cuts for this node" << Endl;
1956  fisherOK = kFALSE;
1957  }else{
1958  TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1959  TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1960  const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1961  const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1962 
1963  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1964  for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1965  if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1966  ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1967  useVarInFisher[ivar] = kTRUE;
1968  useVarInFisher[jvar] = kTRUE;
1969  }
1970  }
1971  }
1972  // now that you know which variables you want to use, count and map them
1973  // such that you can use an array/matrix filled only with THOSE variables
1974  // that are actually used
1975  UInt_t nFisherVars = 0;
1976  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1977  // now pick those variables that are used in the Fisher and are also
1978  // part of the "allowed" variables (in case of randomised trees)
1979  if (useVarInFisher[ivar] && useVariable[ivar]) {
1980  mapVarInFisher[nFisherVars++]=ivar;
1981  // now exclude the variables used in the Fisher cuts, and don't
1982  // use them anymore in the individual variable scan
1983  if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1984  }
1985  }
1986 
1987 
1988  fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1989  fisherOK = kTRUE;
1990  }
1991  delete [] useVarInFisher;
1992  delete [] mapVarInFisher;
1993 
1994  }
1995  // #### End Fisher calculation
1996 
1997 
1998  UInt_t cNvars = fNvars;
1999  if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
2000 
2001  // #### OK now what's going on...
2002  // #### looks like we are setting up histograms
2003  UInt_t* nBins = new UInt_t [cNvars];
2004  Double_t* binWidth = new Double_t [cNvars];
2005  Double_t* invBinWidth = new Double_t [cNvars];
2006 
2007  Double_t** nSelS = new Double_t* [cNvars];
2008  Double_t** nSelB = new Double_t* [cNvars];
2009  Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2010  Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2011  Double_t** target = new Double_t* [cNvars];
2012  Double_t** target2 = new Double_t* [cNvars];
2013  Double_t** cutValues = new Double_t* [cNvars];
2014 
2015  // #### looping through the variables...
2016  for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2017  // #### ncuts means that we need n+1 bins for each variable
2018  nBins[ivar] = fNCuts+1;
2019  if (ivar < fNvars) {
2020  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2021  nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2022  }
2023  }
2024 
2025  // #### make some new arrays for each ith var, size=nbins for each array
2026  // #### integer features get the same number of bins as values, set later
2027  nSelS[ivar] = new Double_t [nBins[ivar]];
2028  nSelB[ivar] = new Double_t [nBins[ivar]];
2029  nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2030  nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2031  target[ivar] = new Double_t [nBins[ivar]];
2032  target2[ivar] = new Double_t [nBins[ivar]];
2033  cutValues[ivar] = new Double_t [nBins[ivar]];
2034 
2035  }
2036 
2037  // #### xmin and xmax for each variable
2038  Double_t *xmin = new Double_t[cNvars];
2039  Double_t *xmax = new Double_t[cNvars];
2040 
2041  // #### ok loop through each variable to initialize all the values
2042  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2043  if (ivar < fNvars){
2044  xmin[ivar]=node->GetSampleMin(ivar);
2045  xmax[ivar]=node->GetSampleMax(ivar);
2046  if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2047  // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2048  // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2049  useVariable[ivar]=kFALSE;
2050  }
2051 
2052  } else { // the fisher variable
2053  xmin[ivar]=999;
2054  xmax[ivar]=-999;
2055  // too bad, for the moment I don't know how to do this without looping
2056  // once to get the "min max" and then AGAIN to fill the histogram
2057  for (UInt_t iev=0; iev<nevents; iev++) {
2058  // returns the Fisher value (no fixed range)
2059  Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2060  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2061  result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2062  if (result > xmax[ivar]) xmax[ivar]=result;
2063  if (result < xmin[ivar]) xmin[ivar]=result;
2064  }
2065  }
2066  for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2067  nSelS[ivar][ibin]=0;
2068  nSelB[ivar][ibin]=0;
2069  nSelS_unWeighted[ivar][ibin]=0;
2070  nSelB_unWeighted[ivar][ibin]=0;
2071  target[ivar][ibin]=0;
2072  target2[ivar][ibin]=0;
2073  cutValues[ivar][ibin]=0;
2074  }
2075  }
2076 
2077  // #### Nothing to parallelize here really, no loop through events
2078  // #### only figures out the bin edge values for the "histogram" arrays
2079  // fill the cut values for the scan:
2080  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2081 
2082  if ( useVariable[ivar] ) {
2083 
2084  //set the grid for the cut scan on the variables like this:
2085  //
2086  // | | | | | ... | |
2087  // xmin xmax
2088  //
2089  // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2090  // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2091  // --> nBins = fNCuts+1
2092  // (NOTE, the cuts at xmin or xmax would just give the whole sample and
2093  // hence can be safely omitted)
2094 
2095  binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2096  invBinWidth[ivar] = 1./binWidth[ivar];
2097  if (ivar < fNvars) {
2098  if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2099  }
2100 
2101  // std::cout << "ivar="<<ivar
2102  // <<" min="<<xmin[ivar]
2103  // << " max="<<xmax[ivar]
2104  // << " width=" << istepSize
2105  // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2106  for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2107  cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2108  // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2109  }
2110  }
2111  }
2112 
2113  // #### Loop through the events to get the total sig and background
2114  nTotS=0; nTotB=0;
2115  nTotS_unWeighted=0; nTotB_unWeighted=0;
2116  for (UInt_t iev=0; iev<nevents; iev++) {
2117 
2118  Double_t eventWeight = eventSample[iev]->GetWeight();
2119  if (eventSample[iev]->GetClass() == fSigClass) {
2120  nTotS+=eventWeight;
2121  nTotS_unWeighted++; }
2122  else {
2123  nTotB+=eventWeight;
2124  nTotB_unWeighted++;
2125  }
2126 
2127  // #### Count the number in each bin (fill array "histograms")
2128  Int_t iBin=-1;
2129  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2130  // now scan through the cuts for each variable and find which one gives
2131  // the best separationGain at the current stage.
2132  if ( useVariable[ivar] ) {
2133  Double_t eventData;
2134  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2135  else { // the fisher variable
2136  eventData = fisherCoeff[fNvars];
2137  for (UInt_t jvar=0; jvar<fNvars; jvar++)
2138  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2139 
2140  }
2141  // #### figure out which bin the event belongs in ...
2142  // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!)
2143  iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2144  if (eventSample[iev]->GetClass() == fSigClass) {
2145  nSelS[ivar][iBin]+=eventWeight;
2146  nSelS_unWeighted[ivar][iBin]++;
2147  }
2148  else {
2149  nSelB[ivar][iBin]+=eventWeight;
2150  nSelB_unWeighted[ivar][iBin]++;
2151  }
2152  if (DoRegression()) {
2153  target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2154  target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2155  }
2156  }
2157  }
2158  }
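// Illustrative sketch (not part of the TMVA source): the bin-index clamp
// used in the event loop above. The raw index comes from the inverse bin
// width; events below xmin are clamped into bin 0 and events at or beyond
// xmax into the last bin, so every event is counted exactly once.
auto binIndex = [](Double_t x, Double_t x0, Double_t invW, UInt_t nb) {
   return TMath::Min(Int_t(nb - 1), TMath::Max(0, int(invW * (x - x0))));
};
(void) binIndex; // sketch only, not called in this function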
2159 
2160  // now turn the "histogram" into a cumulative distribution
2161  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2162  if (useVariable[ivar]) {
2163  for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2164  nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2165  nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2166  nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2167  nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2168  if (DoRegression()) {
2169  target[ivar][ibin] +=target[ivar][ibin-1] ;
2170  target2[ivar][ibin]+=target2[ivar][ibin-1];
2171  }
2172  }
2173  if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2174  Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2175  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2176  << " while eventsample size = " << eventSample.size()
2177  << Endl;
2178  }
2179  double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2180  double totalSum=nTotS+nTotB;
2181  if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2182  Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2183  << lastBins
2184  << " while total number of events = " << totalSum
2185  << Endl;
2186  }
2187  }
2188  }
2189  // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
2190  // now select the optimal cuts for each variable and find which one gives
2191  // the best separationGain at the current stage
2192  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2193  if (useVariable[ivar]) {
2194  for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2195  // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
2196  // calculated by the "SamplePurities" from the branches that would go to the
2197  // left or the right from this node if "these" cuts were used in the Node:
2198  // hereby: nSelS and nSelB would go to the right branch
2199  // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2200 
2201  // only allow splits where both daughter nodes match the specified minimum number
2202  // for this use the "unweighted" events, as you are interested in statistically
2203  // significant splits, which is determined by the actual number of entries
2204  // for a node, rather than the sum of event weights.
2205 
2206  Double_t sl = nSelS_unWeighted[ivar][iBin];
2207  Double_t bl = nSelB_unWeighted[ivar][iBin];
2208  Double_t s = nTotS_unWeighted;
2209  Double_t b = nTotB_unWeighted;
2210  Double_t slW = nSelS[ivar][iBin];
2211  Double_t blW = nSelB[ivar][iBin];
2212  Double_t sW = nTotS;
2213  Double_t bW = nTotB;
2214  Double_t sr = s-sl;
2215  Double_t br = b-bl;
2216  Double_t srW = sW-slW;
2217  Double_t brW = bW-blW;
2218  // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2219  if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2220  && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2221  ) {
2222 
2223  if (DoRegression()) {
2224  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2225  target[ivar][iBin],target2[ivar][iBin],
2226  nTotS+nTotB,
2227  target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2228  } else {
2229  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2230  }
2231  if (separationGain[ivar] < sepTmp) {
2232  separationGain[ivar] = sepTmp;
2233  cutIndex[ivar] = iBin;
2234  }
2235  }
2236  }
2237  }
2238  }
2239 
2240  // now that you have found the best separation cut for each variable, compare the variables
2241  for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2242  if (useVariable[ivar] ) {
2243  if (separationGainTotal < separationGain[ivar]) {
2244  separationGainTotal = separationGain[ivar];
2245  mxVar = ivar;
2246  }
2247  }
2248  }
2249 
2250  if (mxVar >= 0) {
2251  if (DoRegression()) {
2252  node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2253  node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2254  if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2255  node->SetRMS(0);
2256  }else{
2257  node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2258  }
2259  }
2260  else {
2261  node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2262  if (mxVar >=0){
2263  if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2264  else cutType=kFALSE;
2265  }
2266  }
2267  node->SetSelector((UInt_t)mxVar);
2268  node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2269  node->SetCutType(cutType);
2270  node->SetSeparationGain(separationGainTotal);
2271  if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2272  node->SetNFisherCoeff(0);
2273  fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2274  //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2275  }else{
2276  // allocate Fisher coefficients (use fNvars+1 and set the unused ones to zero; this might
2277  // even take less storage space on average than additionally storing the mapping).
2278  // Can always be changed relatively easily.
2279  node->SetNFisherCoeff(fNvars+1);
2280  for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2281  node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
2282  // take the Fisher-coefficient-weighted estimate as variable importance (don't fill the offset coefficient though :)
2283  if (ivar<fNvars){
2284  fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2285  }
2286  }
2287  }
2288  }
2289  else {
2290  separationGainTotal = 0;
2291  }
2292  // if (mxVar > -1) {
2293  // std::cout << "------------------------------------------------------------------"<<std::endl;
2294  // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2295  // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2296  // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2297  // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2298  // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2299  // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2300  // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2301  // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2302  // } else {
2303  // std::cout << "------------------------------------------------------------------"<<std::endl;
2304  // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2305  // }
2306  for (UInt_t i=0; i<cNvars; i++) {
2307  delete [] nSelS[i];
2308  delete [] nSelB[i];
2309  delete [] nSelS_unWeighted[i];
2310  delete [] nSelB_unWeighted[i];
2311  delete [] target[i];
2312  delete [] target2[i];
2313  delete [] cutValues[i];
2314  }
2315  delete [] nSelS;
2316  delete [] nSelB;
2317  delete [] nSelS_unWeighted;
2318  delete [] nSelB_unWeighted;
2319  delete [] target;
2320  delete [] target2;
2321  delete [] cutValues;
2322 
2323  delete [] xmin;
2324  delete [] xmax;
2325 
2326  delete [] useVariable;
2327  delete [] mapVariable;
2328 
2329  delete [] separationGain;
2330  delete [] cutIndex;
2331 
2332  delete [] nBins;
2333  delete [] binWidth;
2334  delete [] invBinWidth;
2335 
2336  return separationGainTotal;
2337 
2338 }
2339 #endif
2340 
2341 
2342 ////////////////////////////////////////////////////////////////////////////////
2343 /// calculate the Fisher coefficients for the event sample and the variables used
2344 
2345 std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2346  std::vector<Double_t> fisherCoeff(fNvars+1);
2347 
2348  // initialization of global matrices and vectors
2349  // average value of each variable for S, B, S+B
2350  TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2351 
2352  // the covariance 'within class' and 'between class' matrices
2353  TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2354  TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2355  TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2356 
2357  //
2358  // compute mean values of variables in each sample, and the overall means
2359  //
2360 
2361  // initialize internal sum-of-weights variables
2362  Double_t sumOfWeightsS = 0;
2363  Double_t sumOfWeightsB = 0;
2364 
2365 
2366  // init vectors
2367  Double_t* sumS = new Double_t[nFisherVars];
2368  Double_t* sumB = new Double_t[nFisherVars];
2369  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2370 
2371  UInt_t nevents = eventSample.size();
2372  // compute sample means
2373  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2374 
2375  // read the Training Event into "event"
2376  const Event * ev = eventSample[ievt];
2377 
2378  // sum of weights
2379  Double_t weight = ev->GetWeight();
2380  if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2381  else sumOfWeightsB += weight;
2382 
2383  Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2384  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2385  sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2386  }
2387  }
2388  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2389  (*meanMatx)( ivar, 2 ) = sumS[ivar];
2390  (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2391 
2392  (*meanMatx)( ivar, 2 ) += sumB[ivar];
2393  (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2394 
2395  // signal + background
2396  (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2397  }
2398 
2399  delete [] sumS;
2400 
2401  delete [] sumB;
2402 
2403  // the matrix of covariance 'within class' reflects the dispersion of the
2404  // events relative to the center of gravity of their own class
2405 
2406  // assert required
2407 
2408  assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2409 
2410  // product matrices (x-<x>)(y-<y>) where x,y are variables
2411 
2412  const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2413  Double_t *sum2Sig = new Double_t[nFisherVars2];
2414  Double_t *sum2Bgd = new Double_t[nFisherVars2];
2415  Double_t *xval = new Double_t[nFisherVars2];
2416  memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2417  memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2418 
2419  // 'within class' covariance
2420  for (UInt_t ievt=0; ievt<nevents; ievt++) {
2421 
2422  // read the Training Event into "event"
2423  // const Event* ev = eventSample[ievt];
2424  const Event* ev = eventSample.at(ievt);
2425 
2426  Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2427 
2428  for (UInt_t x=0; x<nFisherVars; x++) {
2429  xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2430  }
2431  Int_t k=0;
2432  for (UInt_t x=0; x<nFisherVars; x++) {
2433  for (UInt_t y=0; y<nFisherVars; y++) {
2434  if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2435  else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2436  k++;
2437  }
2438  }
2439  }
2440  Int_t k=0;
2441  for (UInt_t x=0; x<nFisherVars; x++) {
2442  for (UInt_t y=0; y<nFisherVars; y++) {
2443  (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2444  k++;
2445  }
2446  }
2447 
2448  delete [] sum2Sig;
2449  delete [] sum2Bgd;
2450  delete [] xval;
2451 
2452 
2453  // the matrix of covariance 'between class' reflects the dispersion of the
2454  // events of a class relative to the global center of gravity of all the classes,
2455  // hence the separation between classes
2456 
2457 
2458  Double_t prodSig, prodBgd;
2459 
2460  for (UInt_t x=0; x<nFisherVars; x++) {
2461  for (UInt_t y=0; y<nFisherVars; y++) {
2462 
2463  prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2464  ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2465  prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2466  ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2467 
2468  (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2469  }
2470  }
2471 
2472 
2473 
2474  // compute full covariance matrix from sum of within and between matrices
2475  for (UInt_t x=0; x<nFisherVars; x++)
2476  for (UInt_t y=0; y<nFisherVars; y++)
2477  (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2478 
2479  // Fisher = Sum { [coeff]*[variables] }
2480  //
2481  // let Xs be the array of the mean values of variables for signal evts
2482  // let Xb be the array of the mean values of variables for backgd evts
2483  // let InvWith be the inverse matrix of the 'within class' correlation matrix
2484  //
2485  // then the array of Fisher coefficients is
2486  // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2487  TMatrixD* theMat = with; // Fisher's original
2488  // TMatrixD* theMat = cov; // Mahalanobis
2489 
2490  TMatrixD invCov( *theMat );
2491  if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2492  Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2493  << TMath::Abs(invCov.Determinant())
2494  << " did you use variables that are linear combinations or highly correlated?"
2495  << Endl;
2496  }
2497  if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2498  Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2499  << TMath::Abs(invCov.Determinant())
2500  << " did you use variables that are linear combinations?"
2501  << Endl;
2502  }
2503 
2504  invCov.Invert();
2505 
2506  // apply rescaling factor
2507  Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2508 
2509  // compute difference of mean values
2510  std::vector<Double_t> diffMeans( nFisherVars );
2511 
2512  for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2513  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2514  for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2515  Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2516  fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2517  }
2518 
2519  // rescale
2520  fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2521  }
2522 
2523  // offset correction
2524  Double_t f0 = 0.0;
2525  for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2526  f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2527  }
2528  f0 /= -2.0;
2529 
2530  fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2531 
2532  return fisherCoeff;
2533 }
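// Illustrative sketch (not part of the TMVA source): the Fisher formula
// quoted inside GetFisherCoefficients, specialized to a single variable so
// the 'within class' matrix inversion collapses to a scalar division. With
// the rescaling factor xfact = sqrt(Ws*Wb)/(Ws+Wb) used above, the
// coefficient is xfact*(meanS-meanB)/withinVar, and the offset matches the
// f0 correction: f0 = -coeff*(meanS+meanB)/2. "fisher1D" is a hypothetical
// helper returning coeff and f0 through output parameters.
inline void fisher1D(Double_t meanS, Double_t meanB, Double_t withinVar,
                     Double_t sumWS, Double_t sumWB,
                     Double_t& coeff, Double_t& f0)
{
   Double_t xfact = TMath::Sqrt(sumWS * sumWB) / (sumWS + sumWB);
   coeff = xfact * (meanS - meanB) / withinVar;
   f0    = -0.5 * coeff * (meanS + meanB);
}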
2534 
2535 ////////////////////////////////////////////////////////////////////////////////
2536 /// train a node by finding the single optimal cut for a single variable
2537 /// that best separates signal and background (maximizes the separation gain)
2538 
2539 Double_t TMVA::DecisionTree::TrainNodeFull( const EventConstList & eventSample,
2540  TMVA::DecisionTreeNode *node )
2541 {
2542  Double_t nTotS = 0.0, nTotB = 0.0;
2543  Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2544 
2545  std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2546 
2547  // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2548  // each variable's slot can be filled in parallel without conflicts
2549  std::vector<Double_t> lCutValue( fNvars, 0.0 );
2550  std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2551  std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2552  lCutType.assign( fNvars, Char_t(kFALSE) );
2553 
2554  // Initialize (un)weighted counters for signal & background
2555  // Construct a list of event wrappers that point to the original data
2556  for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2557  if((*it)->GetClass() == fSigClass) { // signal or background event
2558  nTotS += (*it)->GetWeight();
2559  ++nTotS_unWeighted;
2560  }
2561  else {
2562  nTotB += (*it)->GetWeight();
2563  ++nTotB_unWeighted;
2564  }
2565  bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2566  }
2567 
2568  std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2569  useVariable.assign( fNvars, Char_t(kTRUE) );
2570 
2571  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2572  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2573  if (fUseNvars ==0 ) { // no number specified ... choose something which hopefully works well
2574  // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2575  fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2576  }
2577  Int_t nSelectedVars = 0;
2578  while (nSelectedVars < fUseNvars) {
2579  Double_t bla = fMyTrandom->Rndm()*fNvars;
2580  useVariable[Int_t (bla)] = Char_t(kTRUE);
2581  nSelectedVars = 0;
2582  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2583  if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2584  }
2585  }
2586  }
2587  else {
2588  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2589  }
2590  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2591  if(!useVariable[ivar]) continue; // only optimize with selected variables
2592 
2593  TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2594 
2595  std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2596 
2597 
2598  Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2599 
2600  std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2601  for( ; it != it_end; ++it ) {
2602  if((**it)->GetClass() == fSigClass ) // specify signal or background event
2603  sigWeightCtr += (**it)->GetWeight();
2604  else
2605  bkgWeightCtr += (**it)->GetWeight();
2606  // Store the accumulated signal (background) weights
2607  it->SetCumulativeWeight(false,bkgWeightCtr);
2608  it->SetCumulativeWeight(true,sigWeightCtr);
2609  }
2610 
2611  const Double_t fPMin = 1.0e-6;
2612  Bool_t cutType = kFALSE;
2613  Long64_t index = 0;
2614  Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2615 
2616  // Locate the optimal cut for this (ivar-th) variable
2617  for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2618  if( index == 0 ) { ++index; continue; }
2619  if( *(*it) == NULL ) {
2620  Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2621  << index << ", and parent node=" << node->GetParent() << Endl;
2622  break;
2623  }
2624  dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2625  norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2626  // Only allow splits where both daughter nodes have the specified minimum number of events
2627  // Splits are only sensible when the data are ordered (e.g. don't split inside a sequence of 0's)
2628  if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2629 
2630  sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2631  if( sepTmp > separationGain ) {
2632  separationGain = sepTmp;
2633  cutValue = it->GetVal() - 0.5*dVal;
2634  Double_t nSelS = it->GetCumulativeWeight(true);
2635  Double_t nSelB = it->GetCumulativeWeight(false);
2636  // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2637  // or by removing signal (enhancing background)
2638  if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2639  else cutType = kFALSE;
2640  }
2641  }
2642  ++index;
2643  }
2644  lCutType[ivar] = Char_t(cutType);
2645  lCutValue[ivar] = cutValue;
2646  lSepGain[ivar] = separationGain;
2647  }
2648  Double_t separationGain = -1.0;
2649  Int_t iVarIndex = -1;
2650  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2651  if( lSepGain[ivar] > separationGain ) {
2652  iVarIndex = ivar;
2653  separationGain = lSepGain[ivar];
2654  }
2655  }
2656 
2657  // #### storing the best values into the node
2658  if(iVarIndex >= 0) {
2659  node->SetSelector(iVarIndex);
2660  node->SetCutValue(lCutValue[iVarIndex]);
2661  node->SetSeparationGain(lSepGain[iVarIndex]);
2662  node->SetCutType(lCutType[iVarIndex]);
2663  fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2664  }
2665  else {
2666  separationGain = 0.0;
2667  }
2668 
2669  return separationGain;
2670 }
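// Illustrative sketch (not part of the TMVA source): the sort-and-scan split
// search of TrainNodeFull above, reduced to one variable and unit weights.
// Events are sorted by value, running class counts give the would-be
// daughter contents in O(1) per candidate, and cuts are only placed between
// adjacent distinct values (at the midpoint, like cutValue above).
// "SketchEvt" and "sketchBestCut" are hypothetical names; any two-class gain
// function with this signature can be passed in.
struct SketchEvt { Double_t value; Bool_t isSignal; };

inline Double_t sketchBestCut(std::vector<SketchEvt> evts,
                              Double_t (*gain)(Double_t, Double_t, Double_t, Double_t))
{
   std::sort(evts.begin(), evts.end(),
             [](const SketchEvt& a, const SketchEvt& b) { return a.value < b.value; });
   Double_t s = 0, b = 0;
   for (const SketchEvt& e : evts) (e.isSignal ? s : b) += 1;
   Double_t sl = 0, bl = 0, bestGain = -1, cut = 0;
   for (std::size_t i = 0; i + 1 < evts.size(); ++i) {
      (evts[i].isSignal ? sl : bl) += 1;                  // event i goes to one side
      if (evts[i].value == evts[i + 1].value) continue;   // no cut inside ties
      Double_t g = gain(sl, bl, s, b);
      if (g > bestGain) {
         bestGain = g;
         cut = 0.5 * (evts[i].value + evts[i + 1].value); // midpoint cut
      }
   }
   return cut;
}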
2671 
2672 ////////////////////////////////////////////////////////////////////////////////
2673 /// get the pointer to the leaf node where a particular event ends up...
2674 /// (used in gradient boosting)
2675 
2676 TMVA::DecisionTreeNode* TMVA::DecisionTree::GetEventNode(const TMVA::Event & e) const
2677 {
2678  TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2679  while(current->GetNodeType() == 0) { // intermediate node in a tree
2680  current = (current->GoesRight(e)) ?
2681  (TMVA::DecisionTreeNode*)current->GetRight() :
2682  (TMVA::DecisionTreeNode*)current->GetLeft();
2683  }
2684  return current;
2685 }
2686 
2687 ////////////////////////////////////////////////////////////////////////////////
2688 /// the event e is put into the decision tree (starting at the root node)
2689 /// and the output is the NodeType (signal or background) of the final node (basket)
2690 /// in which the given event ends up, i.e. the result of the classification of
2691 /// the event for this decision tree.
2692 
2693 Double_t TMVA::DecisionTree::CheckEvent( const TMVA::Event * e, Bool_t UseYesNoLeaf ) const
2694 {
2695  TMVA::DecisionTreeNode *current = this->GetRoot();
2696  if (!current){
2697  Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2698  return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2699  }
2700 
2701  while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2702  current = (current->GoesRight(*e)) ?
2703  current->GetRight() :
2704  current->GetLeft();
2705  if (!current) {
2706  Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2707  }
2708 
2709  }
2710 
2711  if (DoRegression()) {
2712  // Note: This path is also taken for MethodBDT with analysis type
2713  // kClassification and kMulticlass when using GradBoost.
2714  // See TMVA::MethodBDT::InitGradBoost
2715  return current->GetResponse();
2716  } else {
2717  if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2718  else return current->GetPurity();
2719  }
2720 }
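// Illustrative sketch (not part of the TMVA source): how CheckEvent's return
// value is typically consumed for classification. With UseYesNoLeaf=kTRUE the
// tree returns the leaf's node type (+1 signal, -1 background); otherwise it
// returns the leaf purity in [0,1]. The 0.5 purity threshold below is a
// hypothetical choice, not a TMVA default.
inline Bool_t sketchIsSignalLike(Double_t checkEventOutput, Bool_t usedYesNoLeaf)
{
   return usedYesNoLeaf ? (checkEventOutput > 0)    // +-1 node type
                        : (checkEventOutput > 0.5); // purity threshold
}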
2721 
2722 ////////////////////////////////////////////////////////////////////////////////
2723 /// calculates the purity S/(S+B) of a given event sample
2724 
2725 Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2726 {
2727  Double_t sumsig=0, sumbkg=0, sumtot=0;
2728  for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2729  if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2730  else sumsig+=eventSample[ievt]->GetWeight();
2731  sumtot+=eventSample[ievt]->GetWeight();
2732  }
2733  // sanity check
2734  if (sumtot!= (sumsig+sumbkg)){
2735  Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2736  << sumtot << " " << sumsig << " " << sumbkg << Endl;
2737  }
2738  if (sumtot>0) return sumsig/(sumsig + sumbkg);
2739  else return -1;
2740 }
2741 
2742 ////////////////////////////////////////////////////////////////////////////////
2743 /// Return the relative variable importance, normalized to all
2744 /// variables together having the importance 1. The importance is
2745 /// evaluated as the total separation-gain that this variable had in
2746 /// the decision trees (weighted by the number of events)
2747 
2748 vector< Double_t > TMVA::DecisionTree::GetVariableImportance()
2749 {
2750  std::vector<Double_t> relativeImportance(fNvars);
2751  Double_t sum=0;
2752  for (UInt_t i=0; i< fNvars; i++) {
2753  sum += fVariableImportance[i];
2754  relativeImportance[i] = fVariableImportance[i];
2755  }
2756 
2757  for (UInt_t i=0; i< fNvars; i++) {
2758  if (sum > std::numeric_limits<double>::epsilon())
2759  relativeImportance[i] /= sum;
2760  else
2761  relativeImportance[i] = 0;
2762  }
2763  return relativeImportance;
2764 }
2765 
2766 ////////////////////////////////////////////////////////////////////////////////
2767 /// returns the relative importance of variable ivar
2768 
2769 Double_t TMVA::DecisionTree::GetVariableImportance( UInt_t ivar )
2770 {
2771  std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2772  if (ivar < fNvars) return relativeImportance[ivar];
2773  else {
2774  Log() << kFATAL << "<GetVariableImportance>" << Endl
2775  << "--- ivar = " << ivar << " is out of range " << Endl;
2776  }
2777 
2778  return -1;
2779 }
2780