// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis
 * Package: TMVA
 * Class  : MethodBDT (Boosted Decision Trees)
 * Web    : http://tmva.sourceforge.net
 *
 * Description:
 *      Analysis of Boosted Decision Trees
 *
 * Authors (alphabetical):
 *      Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland
 *      Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany
 *      Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada
 *      Doug Schouten <dschoute@sfu.ca> - Simon Fraser U., Canada
 *      Jan Therhaag <jan.therhaag@cern.ch> - U. of Bonn, Germany
 *
 * Copyright (c) 2005-2011:
 *      CERN, Switzerland
 *      U. of Victoria, Canada
 *      MPI-K Heidelberg, Germany
 *      U. of Bonn, Germany
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted according to the terms listed in LICENSE
 * (http://tmva.sourceforge.net/LICENSE)
 **********************************************************************************/

#ifndef ROOT_TMVA_MethodBDT
#define ROOT_TMVA_MethodBDT

//////////////////////////////////////////////////////////////////////////
//
// MethodBDT
//
// Analysis of Boosted Decision Trees
//
//////////////////////////////////////////////////////////////////////////
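
// A minimal booking sketch (not part of this header): in the standard TMVA
// workflow a MethodBDT is instantiated through the Factory rather than
// constructed directly. The dataloader name, trees, variables, and option
// values below are illustrative placeholders, not defaults taken from this
// file.
//
//    TMVA::Factory factory("TMVAClassification", outputFile,
//                          "!V:AnalysisType=Classification");
//    TMVA::DataLoader loader("dataset");
//    loader.AddVariable("var1", 'F');
//    loader.AddSignalTree(sigTree);
//    loader.AddBackgroundTree(bkgTree);
//    factory.BookMethod(&loader, TMVA::Types::kBDT, "BDT",
//                       "NTrees=800:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5");
//    factory.TrainAllMethods();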

#include <vector>
#include <memory>
#include "TH2.h"
#include "TTree.h"
#include "TMVA/MethodBase.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/Event.h"
#include "TMVA/LossFunction.h"

// multithreading is available only if the corresponding compilation flag is set
#ifdef R__USE_IMT
#include <ROOT/TThreadExecutor.hxx>
#include "TSystem.h"
#endif

namespace TMVA {

   class SeparationBase;

   class MethodBDT : public MethodBase {

   public:

      // constructor for training and reading
      MethodBDT( const TString& jobName,
                 const TString& methodTitle,
                 DataSetInfo& theData,
                 const TString& theOption = "");

      // constructor for calculating the BDT MVA response using previously generated decision trees
      MethodBDT( DataSetInfo& theData,
                 const TString& theWeightFile);

      virtual ~MethodBDT( void );

      virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets );

      // write all events from the tree into a vector of Events, which are
      // more easily manipulated
      void InitEventSample();

      // optimize tuning parameters
      virtual std::map<TString,Double_t> OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA");
      virtual void SetTuneParameters(std::map<TString,Double_t> tuneParameters);
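
      // A hedged usage sketch for the tuning interface above ("method" is
      // assumed to be a pointer to an already booked MethodBDT):
      //
      //    std::map<TString, Double_t> tuned =
      //        method->OptimizeTuningParameters("ROCIntegral", "FitGA");
      //    method->SetTuneParameters(tuned);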

      // training method
      void Train( void );

      // revoke training
      void Reset( void );

      using MethodBase::ReadWeightsFromStream;

      // write weights to file
      void AddWeightsXMLTo( void* parent ) const;

      // read weights from file
      void ReadWeightsFromStream( std::istream& istr );
      void ReadWeightsFromXML(void* parent);

      // write method specific histos to target file
      void WriteMonitoringHistosToFile( void ) const;

      // calculate the MVA value
      Double_t GetMvaValue( Double_t* err = 0, Double_t* errUpper = 0);

      // get the actual forest size (might be less than fNTrees, the requested number, if boosting is stopped early)
      UInt_t GetNTrees() const {return fForest.size();}
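
      // Application sketch (assuming a weight file written by a previous
      // training; the variable name and file path are placeholders). The
      // Reader ultimately calls GetMvaValue() on this class:
      //
      //    TMVA::Reader reader("!Color:!Silent");
      //    Float_t var1;
      //    reader.AddVariable("var1", &var1);
      //    reader.BookMVA("BDT", "dataset/weights/TMVAClassification_BDT.weights.xml");
      //    // ... fill var1 from the current event, then:
      //    Double_t mva = reader.EvaluateMVA("BDT");
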
   private:

      Double_t GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees );
      Double_t PrivateGetMvaValue( const TMVA::Event *ev, Double_t* err=0, Double_t* errUpper=0, UInt_t useNTrees=0 );
      void BoostMonitor(Int_t iTree);

   public:
      const std::vector<Float_t>& GetMulticlassValues();

      // regression response
      const std::vector<Float_t>& GetRegressionValues();

      // apply the boost algorithm to a tree in the collection
      Double_t Boost( std::vector<const TMVA::Event*>&, DecisionTree *dt, UInt_t cls = 0);

      // ranking of input variables
      const Ranking* CreateRanking();

      // the option handling methods
      void DeclareOptions();
      void ProcessOptions();
      void SetMaxDepth(Int_t d){fMaxDepth = d;}
      void SetMinNodeSize(Double_t sizeInPercent);
      void SetMinNodeSize(TString sizeInPercent);

      void SetNTrees(Int_t d){fNTrees = d;}
      void SetAdaBoostBeta(Double_t b){fAdaBoostBeta = b;}
      void SetNodePurityLimit(Double_t l){fNodePurityLimit = l;}
      void SetShrinkage(Double_t s){fShrinkage = s;}
      void SetUseNvars(Int_t n){fUseNvars = n;}
      void SetBaggedSampleFraction(Double_t f){fBaggedSampleFraction = f;}
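
      // These setters mirror the configuration string parsed in
      // ProcessOptions(); the same values are normally supplied at booking
      // time. A sketch for a gradient-boosted setup (option values are
      // illustrative):
      //
      //    factory.BookMethod(&loader, TMVA::Types::kBDT, "BDTG",
      //                       "NTrees=1000:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:"
      //                       "UseBaggedBoost:BaggedSampleFraction=0.5:MaxDepth=2");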

      // get the forest
      inline const std::vector<TMVA::DecisionTree*> & GetForest() const;

      // get the training events
      inline const std::vector<const TMVA::Event*> & GetTrainingEvents() const;

      inline const std::vector<double> & GetBoostWeights() const;

      // return the individual relative variable importances
      std::vector<Double_t> GetVariableImportance();
      Double_t GetVariableImportance(UInt_t ivar);
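
      // Sketch of inspecting the relative importances after training ("bdt"
      // is assumed to be a trained MethodBDT*):
      //
      //    std::vector<Double_t> imp = bdt->GetVariableImportance();
      //    for (UInt_t i = 0; i < imp.size(); ++i)
      //       std::cout << "variable " << i << ": " << imp[i] << std::endl;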

      Double_t TestTreeQuality( DecisionTree *dt );

      // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
      void MakeClassSpecific( std::ostream&, const TString& ) const;

      // header and auxiliary classes
      void MakeClassSpecificHeader( std::ostream&, const TString& ) const;

      void MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout,
                                     const TString& className ) const;

      void GetHelpMessage() const;

   protected:
      void DeclareCompatibilityOptions();

   private:
      // Init used in the various constructors
      void Init( void );

      void PreProcessNegativeEventWeights();

      // boosting algorithm (adaptive boosting)
      Double_t AdaBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt );
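
      // AdaBoost in outline (a sketch of the standard algorithm; see the TMVA
      // Users Guide for the exact conventions used here). With err the
      // weighted misclassification rate of the tree and beta = fAdaBoostBeta,
      //
      //    alpha = beta * ln((1 - err) / err)
      //    w_i  -> w_i * exp(alpha)   for events the tree misclassifies,
      //
      // followed by renormalisation of the event weights; alpha is also the
      // weight of the tree in the final forest vote (fBoostWeights).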

      // boosting algorithm (adaptive boosting with cost matrix)
      Double_t AdaCost( std::vector<const TMVA::Event*>&, DecisionTree *dt );

      // boosting as a random re-weighting
      Double_t Bagging( );

      // boosting special for regression
      Double_t RegBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt );

      // AdaBoost adapted to regression
      Double_t AdaBoostR2( std::vector<const TMVA::Event*>&, DecisionTree *dt );

      // binomial likelihood gradient boost for classification
      // (see Friedman: "Greedy Function Approximation: a Gradient Boosting Machine",
      // Technical report, Dept. of Statistics, Stanford University)
      Double_t GradBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt, UInt_t cls = 0);
      Double_t GradBoostRegression(std::vector<const TMVA::Event*>&, DecisionTree *dt );
      void InitGradBoost( std::vector<const TMVA::Event*>&);
      void UpdateTargets( std::vector<const TMVA::Event*>&, UInt_t cls = 0);
      void UpdateTargetsRegression( std::vector<const TMVA::Event*>&, Bool_t first=kFALSE);
      Double_t GetGradBoostMVA(const TMVA::Event *e, UInt_t nTrees);
      void GetBaggedSubSample(std::vector<const TMVA::Event*>&);
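
      // Gradient boosting in outline (a sketch of Friedman's scheme; see the
      // reference above for the derivation). Each iteration fits a tree to
      // the current pseudo-residuals of the loss L,
      //
      //    r_i = -dL(y_i, F(x_i)) / dF(x_i),
      //
      // adjusts the leaf responses (UpdateTargets / UpdateTargetsRegression),
      // and adds the tree to the model with learning rate fShrinkage:
      //
      //    F(x) -> F(x) + fShrinkage * f_tree(x)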

      std::vector<const TMVA::Event*>  fEventSample;      // the training events
      std::vector<const TMVA::Event*>  fValidationSample; // the validation events
      std::vector<const TMVA::Event*>  fSubSample;        // subsample for bagged grad boost
      std::vector<const TMVA::Event*> *fTrainSample;      // pointer to the sample actually used in training (fEventSample or fSubSample)

      Int_t                       fNTrees;           // number of decision trees requested
      std::vector<DecisionTree*>  fForest;           // the collection of decision trees
      std::vector<double>         fBoostWeights;     // the weights applied in the individual boosts
      Double_t                    fSigToBkgFraction; // signal-to-background fraction assumed during training
      TString                     fBoostType;        // string specifying the boost type
      Double_t                    fAdaBoostBeta;     // beta parameter for the AdaBoost algorithm
      TString                     fAdaBoostR2Loss;   // loss type used in AdaBoostR2 (Linear, Quadratic, or Exponential)
      //Double_t                  fTransitionPoint;  // break-down point for gradient regression
      Double_t                    fShrinkage;        // learning rate for gradient boost
      Bool_t                      fBaggedBoost;      // turn bagging in combination with boost on/off
      Bool_t                      fBaggedGradBoost;  // turn bagging in combination with grad boost on/off
      //Double_t                  fSumOfWeights;     // sum of all event weights
      //std::map< const TMVA::Event*, std::pair<Double_t, Double_t> > fWeightedResiduals; // weighted regression residuals
      std::map< const TMVA::Event*, LossFunctionEventInfo> fLossFunctionEventInfo; // map event to true value, predicted value, and weight;
                                                                                   // used by the different loss functions for BDT regression
      std::map< const TMVA::Event*, std::vector<double> > fResiduals; // individual event residuals for gradient boost

      // options for the decision tree
      SeparationBase *fSepType;         // the separation used in node splitting
      TString         fSepTypeS;        // the separation (option string) used in node splitting
      Int_t           fMinNodeEvents;   // min number of events in node
      Float_t         fMinNodeSize;     // min percentage of training events in node
      TString         fMinNodeSizeS;    // string containing min percentage of training events in node

      Int_t           fNCuts;               // number of grid points used for the cut scan in node splitting
      Bool_t          fUseFisherCuts;       // use multivariate splits using the Fisher criterion
      Double_t        fMinLinCorrForFisher; // minimum linear correlation between two variables required for use in the Fisher criterion in node splitting
      Bool_t          fUseExclusiveVars;    // variables already used in the Fisher criterion are no longer analysed individually for node splitting
      Bool_t          fUseYesNoLeaf;        // use binary sig/bkg classification in leaf nodes rather than the node purity
      Double_t        fNodePurityLimit;     // purity limit for sig/bkg nodes
      UInt_t          fNNodesMax;           // max number of nodes
      UInt_t          fMaxDepth;            // max depth

      DecisionTree::EPruneMethod fPruneMethod;       // method used for pruning
      TString                    fPruneMethodS;      // prune method option string
      Double_t                   fPruneStrength;     // a parameter setting the "amount" of pruning; needs to be adjusted
      Double_t                   fFValidationEvents; // fraction of events to use for pruning
      Bool_t                     fAutomatic;         // use automatically determined prune strength (from a validation sample) instead of a user-given one
      Bool_t                     fRandomisedTrees;   // choose a random subset of possible cut variables at each node during training
      UInt_t                     fUseNvars;          // number of variables used in the randomised-tree splitting
      Bool_t                     fUsePoissonNvars;   // use fUseNvars not as a fixed number but as the mean of a Poisson distribution in each split
      UInt_t                     fUseNTrainEvents;   // number of randomly picked training events used in randomised (and bagged) trees

      Double_t fBaggedSampleFraction;   // relative size of the bagged event sample to the original sample size
      TString  fNegWeightTreatment;     // option for how to treat negative event weights in training
      Bool_t   fNoNegWeightsInTraining; // ignore negative event weights in the training
      Bool_t   fInverseBoostNegWeights; // boost events with negative weights with 1/boostweight rather than boostweight
      Bool_t   fPairNegWeightsGlobal;   // pair events with negative and positive weights in the training sample and "annihilate" them
      Bool_t   fTrainWithNegWeights;    // the training sample contains negative event weights and they are not ignored
      Bool_t   fDoBoostMonitor;         // create control plot with ROC integral vs. tree number

      // some histograms for monitoring
      TTree    *fMonitorNtuple;  // monitoring ntuple
      Int_t     fITree;          // ntuple var: i-th tree
      Double_t  fBoostWeight;    // ntuple var: boost weight
      Double_t  fErrorFraction;  // ntuple var: misclassification error fraction

      Double_t  fCss;            // AdaCost cost factor
      Double_t  fCts_sb;         // AdaCost cost factor
      Double_t  fCtb_ss;         // AdaCost cost factor
      Double_t  fCbb;            // AdaCost cost factor

      Bool_t    fDoPreselection; // whether to perform automatic pre-selection of 100% efficient cuts

      Bool_t    fSkipNormalization; // true for skipping normalization at initialization of trees

      std::vector<Double_t> fVariableImportance; // the relative importance of the different variables

      void DeterminePreselectionCuts(const std::vector<const TMVA::Event*>& eventSample);
      Double_t ApplyPreselectionCuts(const Event* ev);

      std::vector<Double_t> fLowSigCut;
      std::vector<Double_t> fLowBkgCut;
      std::vector<Double_t> fHighSigCut;
      std::vector<Double_t> fHighBkgCut;

      std::vector<Bool_t> fIsLowSigCut;
      std::vector<Bool_t> fIsLowBkgCut;
      std::vector<Bool_t> fIsHighSigCut;
      std::vector<Bool_t> fIsHighBkgCut;

      Bool_t fHistoricBool; // historic variable, only needed for "CompatibilityOptions"

      TString          fRegressionLossFunctionBDTGS; // option string determining the loss function for BDT regression
      Double_t         fHuberQuantile;               // quantile for the Huber loss function in BDT regression
      LossFunctionBDT *fRegressionLossFunctionBDTG;

      // debugging flags
      static const Int_t fgDebugLevel; // debug level determining some printout/control plots etc.

      // for backward compatibility
      ClassDef(MethodBDT,0); // Analysis of Boosted Decision Trees
   };

} // namespace TMVA

inline const std::vector<TMVA::DecisionTree*>& TMVA::MethodBDT::GetForest() const { return fForest; }
inline const std::vector<const TMVA::Event*>& TMVA::MethodBDT::GetTrainingEvents() const { return fEventSample; }
inline const std::vector<double>& TMVA::MethodBDT::GetBoostWeights() const { return fBoostWeights; }

#endif