MethodBoost.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Or Cohen, Jan Therhaag, Eckhard von Toerne

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodBoost                                                           *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Class for boosting a single TMVA classifier                               *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch>    - CERN, Switzerland         *
 *      Peter Speckmayer  <Peter.Speckmazer@cern.ch>  - CERN, Switzerland         *
 *      Joerg Stelzer     <Joerg.Stelzer@cern.ch>     - CERN, Switzerland         *
 *      Helge Voss        <Helge.Voss@cern.ch>        - MPI-K Heidelberg, Germany *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>      - U of Bonn, Germany        *
 *      Eckhard v. Toerne <evt@uni-bonn.de>           - U of Bonn, Germany        *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

/*! \class TMVA::MethodBoost
\ingroup TMVA

Class for boosting a TMVA method.

This class boosts a single classifier. Boosting means training the classifier
several times on reweighted versions of the training sample: after each pass,
the event weights are modified according to how well the classifier performed
on that sample.

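A typical booking, adapted from the TMVAClassification.C tutorial macro (the
boosted method and the option values are illustrative, not defaults):

~~~{.cpp}
factory->BookMethod( dataloader, TMVA::Types::kFisher, "BoostedFisher",
    "H:!V:Fisher:Boost_Num=20:Boost_Type=AdaBoost:"
    "Boost_AdaBoostBeta=0.2:Boost_Transform=log" );
~~~

All options that steer the boosting begin with "Boost_"; the Factory separates
them from the remaining options, which are passed on to the boosted method.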
*/

#include "TMVA/MethodBoost.h"

#include "TMVA/ClassifierFactory.h"
#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/DataSet.h"
#include "TMVA/DataSetInfo.h"
#include "TMVA/IMethod.h"
#include "TMVA/MethodBase.h"
#include "TMVA/MethodCategory.h"
#include "TMVA/MethodCompositeBase.h"
#include "TMVA/MethodDT.h"
#include "TMVA/MethodFisher.h"
#include "TMVA/PDF.h"
#include "TMVA/Results.h"
#include "TMVA/Timer.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"

#include "TMVA/SeparationBase.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/QuickMVAProbEstimator.h"

#include "Riostream.h"
#include "TRandom3.h"
#include "TFile.h"
#include "TMath.h"
#include "TObjString.h"
#include "TH1F.h"
#include "TH2F.h"
#include "TGraph.h"
#include "TSpline.h"
#include "TDirectory.h"
#include "TTree.h"

#include <algorithm>
#include <iomanip>
#include <vector>
#include <cmath>


REGISTER_METHOD(Boost)

ClassImp(TMVA::MethodBoost);

////////////////////////////////////////////////////////////////////////////////

TMVA::MethodBoost::MethodBoost( const TString& jobName,
                                const TString& methodTitle,
                                DataSetInfo& theData,
                                const TString& theOption ) :
   TMVA::MethodCompositeBase( jobName, Types::kBoost, methodTitle, theData, theOption)
   , fBoostNum(0)
   , fDetailedMonitoring(kFALSE)
   , fAdaBoostBeta(0)
   , fRandomSeed(0)
   , fBaggedSampleFraction(0)
   , fBoostedMethodTitle(methodTitle)
   , fBoostedMethodOptions(theOption)
   , fMonitorBoostedMethod(kFALSE)
   , fMonitorTree(0)
   , fBoostWeight(0)
   , fMethodError(0)
   , fROC_training(0.0)
   , fOverlap_integral(0.0)
   , fMVAvalues(0)
{
   fMVAvalues = new std::vector<Float_t>;
   fDataSetManager = NULL;
   fHistoricBoolOption = kFALSE;
}

////////////////////////////////////////////////////////////////////////////////

TMVA::MethodBoost::MethodBoost( DataSetInfo& dsi,
                                const TString& theWeightFile)
   : TMVA::MethodCompositeBase( Types::kBoost, dsi, theWeightFile)
   , fBoostNum(0)
   , fDetailedMonitoring(kFALSE)
   , fAdaBoostBeta(0)
   , fRandomSeed(0)
   , fBaggedSampleFraction(0)
   , fBoostedMethodTitle("")
   , fBoostedMethodOptions("")
   , fMonitorBoostedMethod(kFALSE)
   , fMonitorTree(0)
   , fBoostWeight(0)
   , fMethodError(0)
   , fROC_training(0.0)
   , fOverlap_integral(0.0)
   , fMVAvalues(0)
{
   fMVAvalues = new std::vector<Float_t>;
   fDataSetManager = NULL;
   fHistoricBoolOption = kFALSE;
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::MethodBoost::~MethodBoost( void )
{
   fMethodWeight.clear();

   // the histograms themselves are deleted when the file is closed

   fTrainSigMVAHist.clear();
   fTrainBgdMVAHist.clear();
   fBTrainSigMVAHist.clear();
   fBTrainBgdMVAHist.clear();
   fTestSigMVAHist.clear();
   fTestBgdMVAHist.clear();

   if (fMVAvalues) {
      delete fMVAvalues;
      fMVAvalues = 0;
   }
}


////////////////////////////////////////////////////////////////////////////////
/// Boost can handle classification with 2 classes and regression with one regression-target

Bool_t TMVA::MethodBoost::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ )
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   // if (type == Types::kRegression && numberTargets == 1) return kTRUE;
   return kFALSE;
}


////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::DeclareOptions()
{
   DeclareOptionRef( fBoostNum = 1, "Boost_Num",
                     "Number of times the classifier is boosted" );

   DeclareOptionRef( fMonitorBoostedMethod = kTRUE, "Boost_MonitorMethod",
                     "Write monitoring histograms for each boosted classifier" );

   DeclareOptionRef( fDetailedMonitoring = kFALSE, "Boost_DetailedMonitoring",
                     "Produce histograms for detailed boost monitoring" );

   DeclareOptionRef( fBoostType = "AdaBoost", "Boost_Type", "Boosting type for the classifiers" );
   AddPreDefVal(TString("RealAdaBoost"));
   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("Bagging"));

   DeclareOptionRef(fBaggedSampleFraction=.6,"Boost_BaggedSampleFraction","Relative size of bagged event sample to original size of the data sample (used whenever bagging is used)" );

   DeclareOptionRef( fAdaBoostBeta = 1.0, "Boost_AdaBoostBeta",
                     "The AdaBoost parameter that sets the effect of every boost step on the events' weights" );

   DeclareOptionRef( fTransformString = "step", "Boost_Transform",
                     "Type of transform applied to every boosted method: step, linear, log, gauss" );
   AddPreDefVal(TString("step"));
   AddPreDefVal(TString("linear"));
   AddPreDefVal(TString("log"));
   AddPreDefVal(TString("gauss"));

   DeclareOptionRef( fRandomSeed = 0, "Boost_RandomSeed",
                     "Seed for random number generator used for bagging" );

   TMVA::MethodCompositeBase::fMethods.reserve(fBoostNum);
}
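
// Example (illustrative, not taken from this file): the options declared in
// DeclareOptions() arrive via the option string of the booked method, e.g.
//
//    "!H:!V:Boost_Num=10:Boost_Type=Bagging:Boost_BaggedSampleFraction=0.6:"
//    "Boost_RandomSeed=12345"
//
// everything that does not begin with "Boost_" is handed to the boosted
// method itself.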

////////////////////////////////////////////////////////////////////////////////
/// Options that are used ONLY by the reader, to ensure backward compatibility;
/// they have no effect otherwise (the reader only reads the training options
/// that HAD been used when the .xml weight file at hand was trained).

void TMVA::MethodBoost::DeclareCompatibilityOptions()
{

   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef( fHistoricOption = "ByError", "Boost_MethodWeightType",
                     "How to set the final weight of the boosted classifiers" );
   AddPreDefVal(TString("ByError"));
   AddPreDefVal(TString("Average"));
   AddPreDefVal(TString("ByROC"));
   AddPreDefVal(TString("ByOverlap"));
   AddPreDefVal(TString("LastMethod"));

   DeclareOptionRef( fHistoricOption = "step", "Boost_Transform",
                     "Type of transform applied to every boosted method: step, linear, log, gauss" );
   AddPreDefVal(TString("step"));
   AddPreDefVal(TString("linear"));
   AddPreDefVal(TString("log"));
   AddPreDefVal(TString("gauss"));

   // the option
   //    DeclareOptionRef( fBoostType = "AdaBoost", "Boost_Type", "Boosting type for the classifiers" );
   // still exists, but the two values below have been removed; they are
   // re-registered here only so that old weight files can still be read
   AddPreDefVal(TString("HighEdgeGauss"));
   AddPreDefVal(TString("HighEdgeCoPara"));

   DeclareOptionRef( fHistoricBoolOption, "Boost_RecalculateMVACut",
                     "Recalculate the classifier MVA signal-like cut at every boost iteration" );

}

////////////////////////////////////////////////////////////////////////////////
/// just registering the string from which the boosted classifier will be created

Bool_t TMVA::MethodBoost::BookMethod( Types::EMVA theMethod, TString methodTitle, TString theOption )
{
   fBoostedMethodName    = Types::Instance().GetMethodName( theMethod );
   fBoostedMethodTitle   = methodTitle;
   fBoostedMethodOptions = theOption;
   TString opts=theOption;
   opts.ToLower();
   // if (opts.Contains("vartransform")) Log() << kFATAL << "It is not possible to use boost in conjunction with variable transform. Please remove either Boost_Num or VarTransform from the option string"<< methodTitle<<Endl;

   return kTRUE;
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::Init()
{
}

////////////////////////////////////////////////////////////////////////////////
/// initialisation routine: book the monitoring histograms and the monitor tree

void TMVA::MethodBoost::InitHistos()
{

   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

   results->Store(new TH1F("MethodWeight","Normalized Classifier Weight",fBoostNum,0,fBoostNum),"ClassifierWeight");
   results->Store(new TH1F("BoostWeight","Boost Weight",fBoostNum,0,fBoostNum),"BoostWeight");
   results->Store(new TH1F("ErrFraction","Error Fraction (by boosted event weights)",fBoostNum,0,fBoostNum),"ErrorFraction");
   if (fDetailedMonitoring){
      results->Store(new TH1F("ROCIntegral_test","ROC integral of single classifier (testing sample)",fBoostNum,0,fBoostNum),"ROCIntegral_test");
      results->Store(new TH1F("ROCIntegralBoosted_test","ROC integral of boosted method (testing sample)",fBoostNum,0,fBoostNum),"ROCIntegralBoosted_test");
      results->Store(new TH1F("ROCIntegral_train","ROC integral of single classifier (training sample)",fBoostNum,0,fBoostNum),"ROCIntegral_train");
      results->Store(new TH1F("ROCIntegralBoosted_train","ROC integral of boosted method (training sample)",fBoostNum,0,fBoostNum),"ROCIntegralBoosted_train");
      results->Store(new TH1F("OverlapIntegral_train","Overlap integral (training sample)",fBoostNum,0,fBoostNum),"Overlap");
   }


   results->GetHist("ClassifierWeight")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("ClassifierWeight")->GetYaxis()->SetTitle("Classifier Weight");
   results->GetHist("BoostWeight")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("BoostWeight")->GetYaxis()->SetTitle("Boost Weight");
   results->GetHist("ErrorFraction")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("ErrorFraction")->GetYaxis()->SetTitle("Error Fraction");
   if (fDetailedMonitoring){
      results->GetHist("ROCIntegral_test")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("ROCIntegral_test")->GetYaxis()->SetTitle("ROC integral of single classifier");
      results->GetHist("ROCIntegralBoosted_test")->GetXaxis()->SetTitle("Number of boosts");
      results->GetHist("ROCIntegralBoosted_test")->GetYaxis()->SetTitle("ROC integral boosted");
      results->GetHist("ROCIntegral_train")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("ROCIntegral_train")->GetYaxis()->SetTitle("ROC integral of single classifier");
      results->GetHist("ROCIntegralBoosted_train")->GetXaxis()->SetTitle("Number of boosts");
      results->GetHist("ROCIntegralBoosted_train")->GetYaxis()->SetTitle("ROC integral boosted");
      results->GetHist("Overlap")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("Overlap")->GetYaxis()->SetTitle("Overlap integral");
   }

   results->Store(new TH1F("SoverBtotal","S/B in reweighted training sample",fBoostNum,0,fBoostNum),"SoverBtotal");
   results->GetHist("SoverBtotal")->GetYaxis()->SetTitle("S/B (boosted sample)");
   results->GetHist("SoverBtotal")->GetXaxis()->SetTitle("Index of boosted classifier");

   results->Store(new TH1F("SeparationGain","SeparationGain",fBoostNum,0,fBoostNum),"SeparationGain");
   results->GetHist("SeparationGain")->GetYaxis()->SetTitle("SeparationGain");
   results->GetHist("SeparationGain")->GetXaxis()->SetTitle("Index of boosted classifier");



   fMonitorTree= new TTree("MonitorBoost","Boost variables");
   fMonitorTree->Branch("iMethod",&fCurrentMethodIdx,"iMethod/I");
   fMonitorTree->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorTree->Branch("errorFraction",&fMethodError,"errorFraction/D");
   fMonitorBoostedMethod = kTRUE;

}


////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CheckSetup()
{
   Log() << kDEBUG << "CheckSetup: fBoostType="<<fBoostType << Endl;
   Log() << kDEBUG << "CheckSetup: fAdaBoostBeta="<<fAdaBoostBeta<<Endl;
   Log() << kDEBUG << "CheckSetup: fBoostWeight="<<fBoostWeight<<Endl;
   Log() << kDEBUG << "CheckSetup: fMethodError="<<fMethodError<<Endl;
   Log() << kDEBUG << "CheckSetup: fBoostNum="<<fBoostNum << Endl;
   Log() << kDEBUG << "CheckSetup: fRandomSeed=" << fRandomSeed<< Endl;
   Log() << kDEBUG << "CheckSetup: fTrainSigMVAHist.size()="<<fTrainSigMVAHist.size()<<Endl;
   Log() << kDEBUG << "CheckSetup: fTestSigMVAHist.size()="<<fTestSigMVAHist.size()<<Endl;
   Log() << kDEBUG << "CheckSetup: fMonitorBoostedMethod=" << (fMonitorBoostedMethod? "true" : "false") << Endl;
   Log() << kDEBUG << "CheckSetup: MName=" << fBoostedMethodName << " Title="<< fBoostedMethodTitle<< Endl;
   Log() << kDEBUG << "CheckSetup: MOptions="<< fBoostedMethodOptions << Endl;
   Log() << kDEBUG << "CheckSetup: fMonitorTree=" << fMonitorTree <<Endl;
   Log() << kDEBUG << "CheckSetup: fCurrentMethodIdx=" <<fCurrentMethodIdx << Endl;
   if (fMethods.size()>0) Log() << kDEBUG << "CheckSetup: fMethods[0]" <<fMethods[0]<<Endl;
   Log() << kDEBUG << "CheckSetup: fMethodWeight.size()" << fMethodWeight.size() << Endl;
   if (fMethodWeight.size()>0) Log() << kDEBUG << "CheckSetup: fMethodWeight[0]="<<fMethodWeight[0]<<Endl;
   Log() << kDEBUG << "CheckSetup: trying to repair things" << Endl;

}
////////////////////////////////////////////////////////////////////////////////
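/// Train all boosted classifiers: one classifier is trained per boost step,
/// its response on the training sample is evaluated, and the event weights
/// are updated via SingleBoost() before the next step; at the end the
/// classifier weights are normalised.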

void TMVA::MethodBoost::Train()
{
   TDirectory* methodDir( 0 );
   TString     dirName,dirTitle;
   Int_t       StopCounter=0;
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());


   InitHistos();

   if (Data()->GetNTrainingEvents()==0) Log() << kFATAL << "<Train> Data() has zero events" << Endl;
   Data()->SetCurrentType(Types::kTraining);

   if (fMethods.size() > 0) fMethods.clear();
   fMVAvalues->resize(Data()->GetNTrainingEvents(), 0.0);

   Log() << kINFO << "Training "<< fBoostNum << " " << fBoostedMethodName << " with title " << fBoostedMethodTitle << " Classifiers ... patience please" << Endl;
   Timer timer( fBoostNum, GetName() );

   ResetBoostWeights();

   // clean boosted method options
   CleanBoostOptions();


   // remove transformations for individual boosting steps
   // the transformation of the main method will be rerouted to each of the boost steps
   Ssiz_t varTrafoStart=fBoostedMethodOptions.Index("~VarTransform=");
   if (varTrafoStart >0) {
      Ssiz_t varTrafoEnd  =fBoostedMethodOptions.Index(":",varTrafoStart);
      if (varTrafoEnd<varTrafoStart)
         varTrafoEnd=fBoostedMethodOptions.Length();
      fBoostedMethodOptions.Remove(varTrafoStart,varTrafoEnd-varTrafoStart);
   }

   //
   // training and boosting the classifiers
   for (fCurrentMethodIdx=0;fCurrentMethodIdx<fBoostNum;fCurrentMethodIdx++) {
      // only the first classifier prints its option string; the others are silenced
      if (fCurrentMethodIdx>0) TMVA::MsgLogger::InhibitOutput();

      IMethod *method = ClassifierFactory::Instance().Create(
         fBoostedMethodName.Data(), GetJobName(), Form("%s_B%04i", fBoostedMethodTitle.Data(), fCurrentMethodIdx),
         DataInfo(), fBoostedMethodOptions);
      TMVA::MsgLogger::EnableOutput();

      // suppressing the rest of the classifier output the right way
      fCurrentMethod  = (dynamic_cast<MethodBase*>(method));

      if (fCurrentMethod==0) {
         Log() << kFATAL << "uups.. guess the booking of the " << fCurrentMethodIdx << "-th classifier somehow failed" << Endl;
         return; // hope that makes Coverity happy (as it fears I might use the pointer later on, not knowing that kFATAL exits)
      }

      // set fDataSetManager if MethodCategory (to enable Category to create datasetinfo objects) // DSMTEST
      if (fCurrentMethod->GetMethodType() == Types::kCategory) { // DSMTEST
         MethodCategory *methCat = (dynamic_cast<MethodCategory*>(fCurrentMethod)); // DSMTEST
         if (!methCat) // DSMTEST
            Log() << kFATAL << "Method with type kCategory cannot be cast to MethodCategory. /MethodBoost" << Endl; // DSMTEST
         methCat->fDataSetManager = fDataSetManager; // DSMTEST
      } // DSMTEST

      fCurrentMethod->SetMsgType(kWARNING);
      fCurrentMethod->SetupMethod();
      fCurrentMethod->ParseOptions();
      // put SetAnalysisType here for the needs of MLP
      fCurrentMethod->SetAnalysisType( GetAnalysisType() );
      fCurrentMethod->ProcessSetup();
      fCurrentMethod->CheckSetup();


      // reroute transformationhandler
      fCurrentMethod->RerouteTransformationHandler (&(this->GetTransformationHandler()));


      // creating the directory of the classifier
      if(!IsSilentFile())
      {
         if (fMonitorBoostedMethod) {
            methodDir=GetFile()->GetDirectory(dirName=Form("%s_B%04i",fBoostedMethodName.Data(),fCurrentMethodIdx));
            if (methodDir==0) {
               methodDir=BaseDir()->mkdir(dirName,dirTitle=Form("Directory Boosted %s #%04i", fBoostedMethodName.Data(),fCurrentMethodIdx));
            }
            fCurrentMethod->SetMethodDir(methodDir);
            fCurrentMethod->BaseDir()->cd();
         }
      }

      // training
      TMVA::MethodCompositeBase::fMethods.push_back(method);
      timer.DrawProgressBar( fCurrentMethodIdx );
      if (fCurrentMethodIdx==0) MonitorBoost(Types::kBoostProcBegin,fCurrentMethodIdx);
      MonitorBoost(Types::kBeforeTraining,fCurrentMethodIdx);
      TMVA::MsgLogger::InhibitOutput(); // suppressing Logger outside the method
      if (fBoostType=="Bagging") Bagging(); // the first classifier should also be trained on a bagged sample
      SingleTrain();
      TMVA::MsgLogger::EnableOutput();
      if(!IsSilentFile())fCurrentMethod->WriteMonitoringHistosToFile();

      // calculate MVA values of current method for all events in training sample
      // (used later on to get 'misclassified events' etc. for the boosting)
      CalcMVAValues();

      if(!IsSilentFile()) if (fCurrentMethodIdx==0 && fMonitorBoostedMethod) CreateMVAHistorgrams();

      // get ROC integral and overlap integral for single method on
      // training sample if fMethodWeightType == "ByROC" or the user
      // wants detailed monitoring

      // boosting (reweight training sample)
      MonitorBoost(Types::kBeforeBoosting,fCurrentMethodIdx);
      SingleBoost(fCurrentMethod);

      MonitorBoost(Types::kAfterBoosting,fCurrentMethodIdx);
      results->GetHist("BoostWeight")->SetBinContent(fCurrentMethodIdx+1,fBoostWeight);
      results->GetHist("ErrorFraction")->SetBinContent(fCurrentMethodIdx+1,fMethodError);

      if (fDetailedMonitoring) {
         fROC_training = GetBoostROCIntegral(kTRUE, Types::kTraining, kTRUE);
         results->GetHist("ROCIntegral_test")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kTRUE, Types::kTesting));
         results->GetHist("ROCIntegralBoosted_test")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kFALSE, Types::kTesting));
         results->GetHist("ROCIntegral_train")->SetBinContent(fCurrentMethodIdx+1, fROC_training);
         results->GetHist("ROCIntegralBoosted_train")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kFALSE, Types::kTraining));
         results->GetHist("Overlap")->SetBinContent(fCurrentMethodIdx+1, fOverlap_integral);
      }



      fMonitorTree->Fill();

      // stop boosting when the error has reached 0.5
      Log() << kDEBUG << "AdaBoost (methodErr) err = " << fMethodError << Endl;
      if (fMethodError > 0.49999) StopCounter++;
      if (StopCounter > 0 && fBoostType != "Bagging") {
         timer.DrawProgressBar( fBoostNum );
         fBoostNum = fCurrentMethodIdx+1;
         Log() << kINFO << "Error rate has reached 0.5 ("<< fMethodError<<"), boosting process stopped at #" << fBoostNum << " classifier" << Endl;
         if (fBoostNum < 5)
            Log() << kINFO << "The classifier might be too strong to boost with Beta = " << fAdaBoostBeta << ", try reducing it." <<Endl;
         break;
      }
   }

   // as MethodBoost does not act on a private event sample (as MethodBDT does),
   // we must not leave "boosted" event weights behind for the next classifier
   // in the factory

   ResetBoostWeights();

   Timer* timer1= new Timer( fBoostNum, GetName() );
   // normalizing the weights of the classifiers
   for (fCurrentMethodIdx=0;fCurrentMethodIdx<fBoostNum;fCurrentMethodIdx++) {
      // performing post-boosting actions

      timer1->DrawProgressBar( fCurrentMethodIdx );

      if (fCurrentMethodIdx==fBoostNum-1) {
         Log() << kINFO << "Elapsed time: " << timer1->GetElapsedTime()
               << "    " << Endl;
      }

      TH1F* tmp = dynamic_cast<TH1F*>( results->GetHist("ClassifierWeight") );
      if (tmp) tmp->SetBinContent(fCurrentMethodIdx+1,fMethodWeight[fCurrentMethodIdx]);

   }

   // Ensure that in case of only 1 boost the method weight equals
   // 1.0. This avoids unexpected behaviour in case of very bad
   // classifiers which have fBoostWeight=1 or fMethodError=0.5,
   // because their weight would be set to zero. This behaviour is
   // not ok if one boosts just one time.
   if (fMethods.size()==1)  fMethodWeight[0] = 1.0;

   MonitorBoost(Types::kBoostProcEnd);

   delete timer1;
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CleanBoostOptions()
{
   fBoostedMethodOptions=GetOptions();
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CreateMVAHistorgrams()
{
   if (fBoostNum <=0) Log() << kFATAL << "CreateHistograms called before fBoostNum is initialized" << Endl;
   // calculate histogram boundaries and create the histograms..
   // nrms = number of RMS around the mean used to define the histogram range
   // (outliers of the first classifier are cut away)
   Double_t meanS, meanB, rmsS, rmsB, xmin, xmax, nrms = 10;
   Int_t signalClass = 0;
   if (DataInfo().GetClassInfo("Signal") != 0) {
      signalClass = DataInfo().GetClassInfo("Signal")->GetNumber();
   }
   gTools().ComputeStat( GetEventCollection( Types::kMaxTreeType ), fMVAvalues,
                         meanS, meanB, rmsS, rmsB, xmin, xmax, signalClass );

   fNbins = gConfig().fVariablePlotting.fNbinsXOfROCCurve;
   xmin = TMath::Max( TMath::Min(meanS - nrms*rmsS, meanB - nrms*rmsB ), xmin );
   xmax = TMath::Min( TMath::Max(meanS + nrms*rmsS, meanB + nrms*rmsB ), xmax ) + 0.00001;

   // creating all the histograms
   for (UInt_t imtd=0; imtd<fBoostNum; imtd++) {
      fTrainSigMVAHist .push_back( new TH1F( Form("MVA_Train_S_%04i",imtd), "MVA_Train_S", fNbins, xmin, xmax ) );
      fTrainBgdMVAHist .push_back( new TH1F( Form("MVA_Train_B%04i", imtd), "MVA_Train_B", fNbins, xmin, xmax ) );
      fBTrainSigMVAHist.push_back( new TH1F( Form("MVA_BTrain_S%04i",imtd), "MVA_BoostedTrain_S", fNbins, xmin, xmax ) );
      fBTrainBgdMVAHist.push_back( new TH1F( Form("MVA_BTrain_B%04i",imtd), "MVA_BoostedTrain_B", fNbins, xmin, xmax ) );
      fTestSigMVAHist  .push_back( new TH1F( Form("MVA_Test_S%04i",  imtd), "MVA_Test_S", fNbins, xmin, xmax ) );
      fTestBgdMVAHist  .push_back( new TH1F( Form("MVA_Test_B%04i",  imtd), "MVA_Test_B", fNbins, xmin, xmax ) );
   }
}

////////////////////////////////////////////////////////////////////////////////
/// reset the boost weights of all events back to 1

void TMVA::MethodBoost::ResetBoostWeights()
{
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event *ev = Data()->GetEvent(ievt);
      ev->SetBoostWeight( 1.0 );
   }
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::WriteMonitoringHistosToFile( void ) const
{
   TDirectory* dir=0;
   if (fMonitorBoostedMethod) {
      for (UInt_t imtd=0;imtd<fBoostNum;imtd++) {

         // writing the histograms in the specific classifier's directory
         MethodBase* m = dynamic_cast<MethodBase*>(fMethods[imtd]);
         if (!m) continue;
         dir = m->BaseDir();
         dir->cd();
         fTrainSigMVAHist[imtd]->SetDirectory(dir);
         fTrainSigMVAHist[imtd]->Write();
         fTrainBgdMVAHist[imtd]->SetDirectory(dir);
         fTrainBgdMVAHist[imtd]->Write();
         fBTrainSigMVAHist[imtd]->SetDirectory(dir);
         fBTrainSigMVAHist[imtd]->Write();
         fBTrainBgdMVAHist[imtd]->SetDirectory(dir);
         fBTrainBgdMVAHist[imtd]->Write();
      }
   }

   // going back to the original folder
   BaseDir()->cd();

   fMonitorTree->Write();
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::TestClassification()
{
   MethodBase::TestClassification();
   if (fMonitorBoostedMethod) {
      UInt_t nloop = fTestSigMVAHist.size();
      if (fMethods.size()<nloop) nloop = fMethods.size();
      // running over all the events and populating the test MVA histograms
      Data()->SetCurrentType(Types::kTesting);
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = GetEvent(ievt);
         Float_t w = ev->GetWeight();
         if (DataInfo().IsSignal(ev)) {
            for (UInt_t imtd=0; imtd<nloop; imtd++) {
               fTestSigMVAHist[imtd]->Fill(fMethods[imtd]->GetMvaValue(),w);
            }
         }
         else {
            for (UInt_t imtd=0; imtd<nloop; imtd++) {
               fTestBgdMVAHist[imtd]->Fill(fMethods[imtd]->GetMvaValue(),w);
            }
         }
      }
      Data()->SetCurrentType(Types::kTraining);
   }
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::WriteEvaluationHistosToFile(Types::ETreeType treetype)
{
   MethodBase::WriteEvaluationHistosToFile(treetype);
   if (treetype==Types::kTraining) return;
   UInt_t nloop = fTestSigMVAHist.size();
   if (fMethods.size()<nloop) nloop = fMethods.size();
   if (fMonitorBoostedMethod) {
      TDirectory* dir=0;
      for (UInt_t imtd=0;imtd<nloop;imtd++) {
         // writing the histograms in the specific classifier's directory
         MethodBase* mva = dynamic_cast<MethodBase*>(fMethods[imtd]);
         if (!mva) continue;
         dir = mva->BaseDir();
         if (dir==0) continue;
         dir->cd();
         fTestSigMVAHist[imtd]->SetDirectory(dir);
         fTestSigMVAHist[imtd]->Write();
         fTestBgdMVAHist[imtd]->SetDirectory(dir);
         fTestBgdMVAHist[imtd]->Write();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// process user options

void TMVA::MethodBoost::ProcessOptions()
{
}

////////////////////////////////////////////////////////////////////////////////
/// train the current boosted classifier on the (possibly reweighted) training sample

void TMVA::MethodBoost::SingleTrain()
{
   Data()->SetCurrentType(Types::kTraining);
   MethodBase* meth = dynamic_cast<MethodBase*>(GetLastMethod());
   if (meth){
      meth->SetSilentFile(IsSilentFile());
      if(IsModelPersistence()){
         TString _fFileDir= DataInfo().GetName();
         _fFileDir+="/"+gConfig().GetIONames().fWeightFileDir;
         meth->SetWeightFileDir(_fFileDir);
      }
      meth->SetModelPersistence(IsModelPersistence());
      meth->TrainMethod();
   }
}

////////////////////////////////////////////////////////////////////////////////
/// find the CUT on the individual MVA that defines an event as
/// correct or misclassified (to be used in the boosting process)
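/// The cut is determined by scanning the cumulative signal and background MVA
/// distributions (the fine-binned histograms below) and choosing the bin edge
/// that maximises the separation gain of the resulting split.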

void TMVA::MethodBoost::FindMVACut(MethodBase *method)
{
   if (!method || method->GetMethodType() == Types::kDT ){ return;}

   // create fine-binned histograms of the MVA distributions
   const Int_t nBins=10001;
   Double_t minMVA=150000;
   Double_t maxMVA=-150000;
   for (Long64_t ievt=0; ievt<Data()->GetNEvents(); ievt++) {
      GetEvent(ievt);
      Double_t val=method->GetMvaValue();
      // Helge .. I think one could very well use fMVAValues for that ... --> to do
      if (val>maxMVA) maxMVA=val;
      if (val<minMVA) minMVA=val;
   }
   maxMVA = maxMVA+(maxMVA-minMVA)/nBins;

   Double_t sum = 0.;

   TH1D *mvaS  = new TH1D(Form("MVAS_%d",fCurrentMethodIdx) ,"",nBins,minMVA,maxMVA);
   TH1D *mvaB  = new TH1D(Form("MVAB_%d",fCurrentMethodIdx) ,"",nBins,minMVA,maxMVA);
   TH1D *mvaSC = new TH1D(Form("MVASC_%d",fCurrentMethodIdx),"",nBins,minMVA,maxMVA);
   TH1D *mvaBC = new TH1D(Form("MVABC_%d",fCurrentMethodIdx),"",nBins,minMVA,maxMVA);


   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
   if (fDetailedMonitoring){
      results->Store(mvaS, Form("MVAS_%d",fCurrentMethodIdx));
      results->Store(mvaB, Form("MVAB_%d",fCurrentMethodIdx));
      results->Store(mvaSC,Form("MVASC_%d",fCurrentMethodIdx));
      results->Store(mvaBC,Form("MVABC_%d",fCurrentMethodIdx));
   }

   for (Long64_t ievt=0; ievt<Data()->GetNEvents(); ievt++) {

      Double_t weight = GetEvent(ievt)->GetWeight();
      Double_t mvaVal=method->GetMvaValue();
      sum +=weight;
      if (DataInfo().IsSignal(GetEvent(ievt))){
         mvaS->Fill(mvaVal,weight);
      }else {
         mvaB->Fill(mvaVal,weight);
      }
   }
   SeparationBase *sepGain;


   // Boosting should use the misclassification error, not the Gini index
   // (changed, Helge 31.5.2013). WARNING! It works with the misclassification
   // error only if you fix the signal-to-background ratio at every step.
   // Strangely enough, there are better results (as seen with BDT) if you use
   // the Gini index and accept that sometimes no sensible cut is found - i.e.
   // the cut is then outside the MVA value range, all events are classified as
   // background, and then, according to the boost algorithm, something is
   // renormalised 'automatically' ... so that in the next step the result is
   // again something sensible. Strange ... that THIS is supposed to be right?

   // SeparationBase *sepGain2 = new MisClassificationError();
   // sepGain = new MisClassificationError();
   sepGain = new GiniIndex();
   // sepGain = new CrossEntropy();

   Double_t sTot = mvaS->GetSum();
   Double_t bTot = mvaB->GetSum();

   mvaSC->SetBinContent(1,mvaS->GetBinContent(1));
   mvaBC->SetBinContent(1,mvaB->GetBinContent(1));
   Double_t sSel=0;
   Double_t bSel=0;
   Double_t separationGain=sepGain->GetSeparationGain(sSel,bSel,sTot,bTot);
   Double_t mvaCut=mvaSC->GetBinLowEdge(1);
   Double_t sSelCut=sSel;
   Double_t bSelCut=bSel;
   // std::cout << "minMVA =" << minMVA << " maxMVA = " << maxMVA << " width = " << mvaSC->GetBinWidth(1) << std::endl;

   // for (Int_t ibin=1;ibin<=nBins;ibin++) std::cout << " cutvalues[" << ibin<<"]="<<mvaSC->GetBinLowEdge(ibin) << " " << mvaSC->GetBinCenter(ibin) << std::endl;
   Double_t mvaCutOrientation=1; // 1 if mva > mvaCut --> Signal and -1 if mva < mvaCut (i.e. mva*-1 > mvaCut*-1) --> Signal
   for (Int_t ibin=1;ibin<=nBins;ibin++){
      mvaSC->SetBinContent(ibin,mvaS->GetBinContent(ibin)+mvaSC->GetBinContent(ibin-1));
      mvaBC->SetBinContent(ibin,mvaB->GetBinContent(ibin)+mvaBC->GetBinContent(ibin-1));

      sSel=mvaSC->GetBinContent(ibin);
      bSel=mvaBC->GetBinContent(ibin);

      // if (ibin==nBins){
      //    std::cout << "Last bin s="<< sSel <<" b="<<bSel << " s="<< sTot-sSel <<" b="<<bTot-bSel << endl;
      // }

      if (separationGain < sepGain->GetSeparationGain(sSel,bSel,sTot,bTot)
          // && (mvaSC->GetBinCenter(ibin) >0 || (fCurrentMethodIdx+1)%2 )
          ){
         separationGain = sepGain->GetSeparationGain(sSel,bSel,sTot,bTot);
         // mvaCut=mvaSC->GetBinCenter(ibin);
         mvaCut=mvaSC->GetBinLowEdge(ibin+1);
         // if (sSel/bSel > (sTot-sSel)/(bTot-bSel)) mvaCutOrientation=-1;
         if (sSel*(bTot-bSel) > (sTot-sSel)*bSel) mvaCutOrientation=-1;
         else                                     mvaCutOrientation=1;
         sSelCut=sSel;
         bSelCut=bSel;
         // std::cout << "new cut at " << mvaCut << "with s="<<sTot-sSel << " b="<<bTot-bSel << std::endl;
      }
      /*
        Double_t ori;
        if (sSel/bSel > (sTot-sSel)/(bTot-bSel)) ori=-1;
        else ori=1;
        std::cout << ibin << " mvacut="<<mvaCut
                  << " sTot=" << sTot
                  << " bTot=" << bTot
                  << " sSel=" << sSel
                  << " bSel=" << bSel
                  << " s/b(1)=" << sSel/bSel
                  << " s/b(2)=" << (sTot-sSel)/(bTot-bSel)
                  << " sepGain="<<sepGain->GetSeparationGain(sSel,bSel,sTot,bTot)
                  << " sepGain2="<<sepGain2->GetSeparationGain(sSel,bSel,sTot,bTot)
                  << " " <<ori
                  << std::endl;
      */

   }

   if (0){
      double parentIndex=sepGain->GetSeparationIndex(sTot,bTot);
      double leftIndex  =sepGain->GetSeparationIndex(sSelCut,bSelCut);
      double rightIndex =sepGain->GetSeparationIndex(sTot-sSelCut,bTot-bSelCut);
      std::cout
         << " sTot=" << sTot
         << " bTot=" << bTot
         << " s="<<sSelCut
         << " b="<<bSelCut
         << " s2="<<(sTot-sSelCut)
         << " b2="<<(bTot-bSelCut)
         << " s/b(1)=" << sSelCut/bSelCut
         << " s/b(2)=" << (sTot-sSelCut)/(bTot-bSelCut)
         << " index before cut=" << parentIndex
         << " after: left=" << leftIndex
         << " after: right=" << rightIndex
         << " sepGain=" << parentIndex-( (sSelCut+bSelCut) * leftIndex + (sTot-sSelCut+bTot-bSelCut) * rightIndex )/(sTot+bTot)
         << " sepGain="<<separationGain
         << " sepGain="<<sepGain->GetSeparationGain(sSelCut,bSelCut,sTot,bTot)
         << " cut=" << mvaCut
         << " idx="<<fCurrentMethodIdx
         << " cutOrientation="<<mvaCutOrientation
         << std::endl;
   }
   method->SetSignalReferenceCut(mvaCut);
   method->SetSignalReferenceCutOrientation(mvaCutOrientation);

   results->GetHist("SeparationGain")->SetBinContent(fCurrentMethodIdx+1,separationGain);


   Log() << kDEBUG << "(old step) Setting method cut to " <<method->GetSignalReferenceCut()<< Endl;

   if(IsSilentFile())
   {
      mvaS ->Delete();
      mvaB ->Delete();
      mvaSC->Delete();
      mvaBC->Delete();
   }
}

////////////////////////////////////////////////////////////////////////////////

Double_t TMVA::MethodBoost::SingleBoost(MethodBase* method)
{
   Double_t returnVal=-1;


   if      (fBoostType=="AdaBoost")     returnVal = this->AdaBoost (method,1);
   else if (fBoostType=="RealAdaBoost") returnVal = this->AdaBoost (method,0);
   else if (fBoostType=="Bagging")      returnVal = this->Bagging  ();
   else{
      Log() << kFATAL << "<Boost> unknown boost option " << fBoostType<< " called" << Endl;
   }
   fMethodWeight.push_back(returnVal);
   return returnVal;
}
////////////////////////////////////////////////////////////////////////////////
/// the standard (discrete or real) AdaBoost algorithm

Double_t TMVA::MethodBoost::AdaBoost(MethodBase* method, Bool_t discreteAdaBoost)
{
   if (!method) {
      Log() << kWARNING << " AdaBoost called without classifier reference - needed for calculating AdaBoost " << Endl;
      return 0;
   }

   Float_t w,v; Bool_t sig=kTRUE;
   Double_t sumAll=0, sumWrong=0;
   Bool_t* WrongDetection=new Bool_t[GetNEvents()];
   QuickMVAProbEstimator *MVAProb=NULL;

   if (discreteAdaBoost) {
      FindMVACut(method);
      Log() << kDEBUG << " individual mva cut value = " << method->GetSignalReferenceCut() << Endl;
   } else {
      MVAProb=new TMVA::QuickMVAProbEstimator();
      // RealAdaBoost does not use a simple "yes (signal)" or "no (background)"
      // answer from the single MVA, but a "signal probability" instead (in the
      // BDT case, that would be the 'purity' in the leaf node). For some MLP
      // parameters the MVA output can also be interpreted as a probability, but
      // here a general approach is used to get this probability from the MVA
      // distributions...

      for (Long64_t evt=0; evt<GetNEvents(); evt++) {
         const Event* ev =  Data()->GetEvent(evt);
         MVAProb->AddEvent(fMVAvalues->at(evt),ev->GetWeight(),ev->GetClass());
      }
   }


   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) WrongDetection[ievt]=kTRUE;

   // finding the wrong events and calculating their total weights
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = GetEvent(ievt);
      sig=DataInfo().IsSignal(ev);
      v = fMVAvalues->at(ievt);
      w = ev->GetWeight();
      sumAll += w;
      if(!IsSilentFile())
      {
         if (fMonitorBoostedMethod) {
            if (sig) {
               fBTrainSigMVAHist[fCurrentMethodIdx]->Fill(v,w);
               fTrainSigMVAHist[fCurrentMethodIdx]->Fill(v,ev->GetOriginalWeight());
            }
            else {
               fBTrainBgdMVAHist[fCurrentMethodIdx]->Fill(v,w);
               fTrainBgdMVAHist[fCurrentMethodIdx]->Fill(v,ev->GetOriginalWeight());
            }
         }
      }

      if (discreteAdaBoost){
         if (sig == method->IsSignalLike(fMVAvalues->at(ievt))){
            WrongDetection[ievt]=kFALSE;
         }else{
            WrongDetection[ievt]=kTRUE;
            sumWrong+=w;
         }
      }else{
         Double_t mvaProb = MVAProb->GetMVAProbAt((Float_t)fMVAvalues->at(ievt));
         mvaProb = 2*(mvaProb-0.5);
         Int_t trueType;
         if (DataInfo().IsSignal(ev)) trueType = 1;
         else trueType = -1;
         sumWrong+= w*trueType*mvaProb;
      }
   }

   fMethodError=sumWrong/sumAll;

   // from fMethodError, the boost weight is computed such that each
   // misclassified event is rescaled by w = ((1-err)/err)^beta
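   //
   // sketch of the discrete AdaBoost update implemented below (for err < 0.5):
   //    alpha  = beta * ln((1-err)/err)     // boost weight of this classifier
   //    w_i   -> w_i * exp(alpha)           // for misclassified events only
   // afterwards all weights are renormalised so that their sum is unchanged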

   Double_t boostWeight=0;

   if (fMethodError == 0) { // no misclassification made.. perfect, no boost ;)
      Log() << kWARNING << "Your classifier worked perfectly on the training sample --> serious overtraining expected and no boosting done " << Endl;
   }else{

      if (discreteAdaBoost)
         boostWeight = TMath::Log((1.-fMethodError)/fMethodError)*fAdaBoostBeta;
      else
         boostWeight = TMath::Log((1.+fMethodError)/(1-fMethodError))*fAdaBoostBeta;
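
      // in the real-AdaBoost branch, fMethodError is the weighted average of
      // trueType*mvaProb (the scaled signal probability); each event is later
      // rescaled by exp(-boostWeight*trueType*mvaProb) instead of by a fixed
      // factor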


      // std::cout << "boostweight = " << boostWeight << std::endl;

      // AdaBoost: rescale the weight of the wrongly classified events according
      // to the error level measured over the entire training sample; afterwards
      // all weights are rescaled to the same sum, without touching the original
      // weights (only the boost weight of every event is changed).
      // first the reweighting:

      Double_t newSum=0., oldSum=0.;


      Double_t boostfactor = TMath::Exp(boostWeight);


      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev =  Data()->GetEvent(ievt);
         oldSum += ev->GetWeight();
         if (discreteAdaBoost){
            // events are classified as Signal OR background .. right or wrong
            if (WrongDetection[ievt] && boostWeight != 0) {
               if (ev->GetWeight() > 0) ev->ScaleBoostWeight(boostfactor);
               else                     ev->ScaleBoostWeight(1./boostfactor);
            }
            // if (ievt<30) std::cout<<ievt<<" var0="<<ev->GetValue(0)<<" var1="<<ev->GetValue(1)<<" weight="<<ev->GetWeight() << " boostby:"<<boostfactor<<std::endl;

         }else{
            // events are classified by their probability of being signal or
            // background (one could re-use the MVA values that were already
            // calculated and stored; for the moment they are looked up again)
            Double_t mvaProb = MVAProb->GetMVAProbAt((Float_t)fMVAvalues->at(ievt));
            mvaProb = 2*(mvaProb-0.5);
            // mvaProb = (1-mvaProb);

            Int_t trueType=1;
            if (DataInfo().IsSignal(ev)) trueType = 1;
            else trueType = -1;

            boostfactor = TMath::Exp(-1*boostWeight*trueType*mvaProb);
            if (ev->GetWeight() > 0) ev->ScaleBoostWeight(boostfactor);
            else                     ev->ScaleBoostWeight(1./boostfactor);

         }
         newSum += ev->GetWeight();
      }

      Double_t normWeight = oldSum/newSum;
      // next normalize the weights
      Double_t normSig=0, normBkg=0;
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = Data()->GetEvent(ievt);
         ev->ScaleBoostWeight(normWeight);
         if (ev->GetClass()) normSig+=ev->GetWeight();
         else                normBkg+=ev->GetWeight();
      }

      Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
      results->GetHist("SoverBtotal")->SetBinContent(fCurrentMethodIdx+1, normSig/normBkg);

      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = Data()->GetEvent(ievt);

         if (ev->GetClass()) ev->ScaleBoostWeight(oldSum/normSig/2);
         else                ev->ScaleBoostWeight(oldSum/normBkg/2);
      }
   }

   delete[] WrongDetection;
   if (MVAProb) delete MVAProb;

   fBoostWeight = boostWeight;  // used ONLY for the monitoring tree

   return boostWeight;
}


////////////////////////////////////////////////////////////////////////////////
/// Bagging or bootstrap "boosting": assigns a new random Poisson weight to
/// every event.
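/// Each event receives a boost weight drawn from Poisson(fBaggedSampleFraction),
/// which emulates drawing a bagged subsample whose expected size is
/// fBaggedSampleFraction times that of the original sample (weighted bootstrap).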

Double_t TMVA::MethodBoost::Bagging()
{
   TRandom3 *trandom = new TRandom3(fRandomSeed+fMethods.size());
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = Data()->GetEvent(ievt);
      ev->SetBoostWeight(trandom->PoissonD(fBaggedSampleFraction));
   }
   delete trandom;   // the generator is owned locally and must be freed
   fBoostWeight = 1; // used ONLY for the monitoring tree
   return 1.;
}



////////////////////////////////////////////////////////////////////////////////
/// Get help message text
///
/// typical length of text line:
///         "|--------------------------------------------------------------|"

void TMVA::MethodBoost::GetHelpMessage() const
{
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "This method combines several classifiers of the same kind into a "<<Endl;
   Log() << "single multivariate quantity via the boost algorithm." << Endl;
   Log() << "The output is a weighted sum over all individual classifiers." <<Endl;
   Log() << "By default, the AdaBoost method is employed, which gives " << Endl;
   Log() << "events that were misclassified by the previous classifier a larger " << Endl;
   Log() << "weight in the training of the following classifier."<<Endl;
   Log() << "Optionally, Bagged boosting can also be applied." << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "The most important parameters in the configuration are the "<<Endl;
   Log() << "number of boosts applied (Boost_Num) and the choice of boosting"<<Endl;
   Log() << "(Boost_Type), which can be set to either AdaBoost or Bagging." << Endl;
   Log() << "AdaBoosting: The most important parameter in this configuration" <<Endl;
   Log() << "is the beta parameter (Boost_AdaBoostBeta). " << Endl;
   Log() << "When boosting a linear classifier, it is sometimes advantageous"<<Endl;
   Log() << "to transform the MVA output non-linearly. The following options" <<Endl;
   Log() << "are available: step (default), linear, log, and gauss."<<Endl;
   Log() <<Endl;
   Log() << "Some classifiers are hard to boost and do not improve much in"<<Endl;
   Log() << "their performance by boosting them; some even slightly deteriorate"<< Endl;
   Log() << "due to the boosting." <<Endl;
   Log() << "The booking of the boost method is special since it requires"<<Endl;
   Log() << "the booking of the method to be boosted and of the boost itself."<<Endl;
   Log() << "This is solved by booking the method to be boosted and adding"<<Endl;
   Log() << "all Boost parameters, which all begin with \"Boost_\", to the"<<Endl;
   Log() << "options string. The factory separates the options and initiates"<<Endl;
   Log() << "the boost process. The TMVA macro directory contains the example"<<Endl;
   Log() << "macro \"Boost.C\"." <<Endl;
}

////////////////////////////////////////////////////////////////////////////////

const TMVA::Ranking* TMVA::MethodBoost::CreateRanking()
{
   return 0;
}

////////////////////////////////////////////////////////////////////////////////
/// return boosted MVA response

Double_t TMVA::MethodBoost::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   Double_t mvaValue = 0;
   Double_t norm = 0;
   Double_t epsilon = TMath::Exp(-1.);
   //Double_t fact    = TMath::Exp(-1.)+TMath::Exp(1.);
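
   // the boosted response is the method-weighted average of the transformed
   // individual responses:  F(x) = sum_i w_i * T(f_i(x)) / sum_i w_i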
   for (UInt_t i=0;i< fMethods.size(); i++){
      MethodBase* m = dynamic_cast<MethodBase*>(fMethods[i]);
      if (m==0) continue;
      Double_t val = fTmpEvent ? m->GetMvaValue(fTmpEvent) : m->GetMvaValue();
      Double_t sigcut = m->GetSignalReferenceCut();

      // "linear" leaves the MVA value untouched
      if (fTransformString == "linear"){

      }
      else if (fTransformString == "log"){
         if (val < sigcut) val = sigcut;

         val = TMath::Log((val-sigcut)+epsilon);
      }
      else if (fTransformString == "step" ){
         if (m->IsSignalLike(val)) val = 1.;
         else val = -1.;
      }
      else if (fTransformString == "gauss"){
         val = TMath::Gaus((val-sigcut),1);
      }
      else {
         Log() << kFATAL << "error unknown transformation " << fTransformString<<Endl;
      }
      mvaValue+=val*fMethodWeight[i];
      norm    +=fMethodWeight[i];
      // std::cout << "mva("<<i<<") = "<<val<<" " << valx<< " " << mvaValue<<" and sigcut="<<sigcut << std::endl;
   }
   mvaValue/=norm;
   // cannot determine error
   NoErrorCalc(err, errUpper);

   return mvaValue;
}

////////////////////////////////////////////////////////////////////////////////
/// Calculate the ROC integral of a single classifier or even the
/// whole boosted classifier. The tree type (training or testing
/// sample) is specified by 'eTT'.
///
/// If tree type kTraining is set, the original training sample is
/// used to compute the ROC integral (original weights).
///
/// - singleMethod - if kTRUE, return ROC integral of single (last
///   trained) classifier; if kFALSE, return ROC
///   integral of full classifier
///
/// - eTT - tree type (Types::kTraining / Types::kTesting)
///
/// - CalcOverlapIntergral - if kTRUE, the overlap integral of the
///   signal/background MVA distributions
///   is calculated and stored in
///   'fOverlap_integral'
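///
/// The ROC integral itself is computed from spline PDFs of the signal and
/// background MVA distributions via MethodBase::GetROCIntegral(fS, fB).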

Double_t TMVA::MethodBoost::GetBoostROCIntegral(Bool_t singleMethod, Types::ETreeType eTT, Bool_t CalcOverlapIntergral)
{
   // set data sample training / testing
   Data()->SetCurrentType(eTT);

   MethodBase* method = singleMethod ? dynamic_cast<MethodBase*>(fMethods.back()) : 0; // ToDo CoVerity flags this line as there is no protection against a zero-pointer delivered by dynamic_cast
   // to make CoVerity happy (although, OF COURSE, the last method in the committee
   // has to be also of type MethodBase as ANY method is... hence the dynamic_cast
   // will never be "zero" ...)
   if (singleMethod && !method) {
      Log() << kFATAL << " What do you do? Your method:"
            << fMethods.back()->GetName()
            << " seems not to be a proper TMVA method"
            << Endl;
      std::exit(1);
   }
   Double_t err = 0.0;

   // temporarily renormalise the method weights for the evaluation of the
   // full classifier; save the old normalisation of the methods
   std::vector<Double_t> OldMethodWeight(fMethodWeight);
   if (!singleMethod) {
      // calculate sum of weights of all methods
      Double_t AllMethodsWeight = 0;
      for (UInt_t i=0; i<=fCurrentMethodIdx; i++)
         AllMethodsWeight += fMethodWeight.at(i);
      // normalize the weights of the classifiers
      if (AllMethodsWeight != 0.0) {
         for (UInt_t i=0; i<=fCurrentMethodIdx; i++)
            fMethodWeight[i] /= AllMethodsWeight;
      }
   }

   // calculate MVA values
   Double_t meanS, meanB, rmsS, rmsB, xmin, xmax, nrms = 10;
   std::vector <Float_t>* mvaRes;
   if (singleMethod && eTT==Types::kTraining)
      mvaRes = fMVAvalues; // values already calculated
   else {
      mvaRes = new std::vector <Float_t>(GetNEvents());
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         GetEvent(ievt);
         (*mvaRes)[ievt] = singleMethod ? method->GetMvaValue(&err) : GetMvaValue(&err);
      }
   }

   // restore the method weights
   if (!singleMethod)
      fMethodWeight = OldMethodWeight;

   // now create histograms for calculation of the ROC integral
   Int_t signalClass = 0;
   if (DataInfo().GetClassInfo("Signal") != 0) {
      signalClass = DataInfo().GetClassInfo("Signal")->GetNumber();
   }
   gTools().ComputeStat( GetEventCollection(eTT), mvaRes,
                         meanS, meanB, rmsS, rmsB, xmin, xmax, signalClass );

   fNbins = gConfig().fVariablePlotting.fNbinsXOfROCCurve;
   xmin = TMath::Max( TMath::Min(meanS - nrms*rmsS, meanB - nrms*rmsB ), xmin );
   xmax = TMath::Min( TMath::Max(meanS + nrms*rmsS, meanB + nrms*rmsB ), xmax ) + 0.0001;

   // calculate ROC integral
   TH1* mva_s = new TH1F( "MVA_S", "MVA_S", fNbins, xmin, xmax );
   TH1* mva_b = new TH1F( "MVA_B", "MVA_B", fNbins, xmin, xmax );
   TH1 *mva_s_overlap=0, *mva_b_overlap=0;
   if (CalcOverlapIntergral) {
      mva_s_overlap = new TH1F( "MVA_S_OVERLAP", "MVA_S_OVERLAP", fNbins, xmin, xmax );
      mva_b_overlap = new TH1F( "MVA_B_OVERLAP", "MVA_B_OVERLAP", fNbins, xmin, xmax );
   }
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = GetEvent(ievt);
      Float_t w = (eTT==Types::kTesting ? ev->GetWeight() : ev->GetOriginalWeight());
      if (DataInfo().IsSignal(ev)) mva_s->Fill( (*mvaRes)[ievt], w );
      else                         mva_b->Fill( (*mvaRes)[ievt], w );

      if (CalcOverlapIntergral) {
         Float_t w_ov = ev->GetWeight();
         if (DataInfo().IsSignal(ev))
            mva_s_overlap->Fill( (*mvaRes)[ievt], w_ov );
         else
            mva_b_overlap->Fill( (*mvaRes)[ievt], w_ov );
      }
   }
   gTools().NormHist( mva_s );
   gTools().NormHist( mva_b );
   PDF *fS = new PDF( "PDF Sig", mva_s, PDF::kSpline2 );
   PDF *fB = new PDF( "PDF Bkg", mva_b, PDF::kSpline2 );

   // calculate ROC integral from fS, fB
   Double_t ROC = MethodBase::GetROCIntegral(fS, fB);

   // calculate overlap integral
   if (CalcOverlapIntergral) {
      gTools().NormHist( mva_s_overlap );
      gTools().NormHist( mva_b_overlap );

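      // overlap integral = sum over bins of min(S,B) of the two normalised
      // MVA distributions; 1 means complete overlap, 0 perfect separation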
      fOverlap_integral = 0.0;
      for (Int_t bin=1; bin<=mva_s_overlap->GetNbinsX(); bin++){
         Double_t bc_s = mva_s_overlap->GetBinContent(bin);
         Double_t bc_b = mva_b_overlap->GetBinContent(bin);
         if (bc_s > 0.0 && bc_b > 0.0)
            fOverlap_integral += TMath::Min(bc_s, bc_b);
      }

      delete mva_s_overlap;
      delete mva_b_overlap;
   }

   delete mva_s;
   delete mva_b;
   delete fS;
   delete fB;
   if (!(singleMethod && eTT==Types::kTraining)) delete mvaRes;

   Data()->SetCurrentType(Types::kTraining);

   return ROC;
}


////////////////////////////////////////////////////////////////////////////////
/// Calculate MVA values of current method fMethods.back() on training sample.

void TMVA::MethodBoost::CalcMVAValues()
{
   Data()->SetCurrentType(Types::kTraining);
   MethodBase* method = dynamic_cast<MethodBase*>(fMethods.back());
   if (!method) {
      Log() << kFATAL << "dynamic cast to MethodBase* failed" <<Endl;
      return;
   }
   // calculate MVA values
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      GetEvent(ievt);
      fMVAvalues->at(ievt) = method->GetMvaValue();
   }

   // (a cumulative MVA distribution could be filled here; currently not implemented)

}

////////////////////////////////////////////////////////////////////////////////
/// Fill various monitoring histograms with information from the individual
/// classifiers that have been boosted. This depends very much on the
/// individual classifier; so far such monitoring is actually implemented only
/// for Decision Trees.

void TMVA::MethodBoost::MonitorBoost( Types::EBoostStage stage , UInt_t methodIndex )
{
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

   if (GetCurrentMethod(methodIndex)->GetMethodType() == TMVA::Types::kDT) {
      TMVA::MethodDT* currentDT=dynamic_cast<TMVA::MethodDT*>(GetCurrentMethod(methodIndex));
      if (currentDT){
         if (stage == Types::kBoostProcBegin){
            results->Store(new TH1I("NodesBeforePruning","nodes before pruning",this->GetBoostNum(),0,this->GetBoostNum()),"NodesBeforePruning");
            results->Store(new TH1I("NodesAfterPruning","nodes after pruning",this->GetBoostNum(),0,this->GetBoostNum()),"NodesAfterPruning");
         }

         if (stage == Types::kBeforeTraining){
         }
         else if (stage == Types::kBeforeBoosting){
            results->GetHist("NodesBeforePruning")->SetBinContent(methodIndex+1,currentDT->GetNNodesBeforePruning());
            results->GetHist("NodesAfterPruning")->SetBinContent(methodIndex+1,currentDT->GetNNodes());
         }
         else if (stage == Types::kAfterBoosting){

         }
         else if (stage == Types::kBoostProcEnd){
            Log() << kINFO << "<Train> average number of nodes before/after pruning : "
                  << results->GetHist("NodesBeforePruning")->GetMean() << " / "
                  << results->GetHist("NodesAfterPruning")->GetMean()
                  << Endl;
         }
      }

   }else if (GetCurrentMethod(methodIndex)->GetMethodType() == TMVA::Types::kFisher) {
      if (stage == Types::kAfterBoosting){
         TMVA::MsgLogger::EnableOutput();
      }
   }else{
      if (methodIndex < 3){
         Log() << kDEBUG << "No detailed boost monitoring for "
               << GetCurrentMethod(methodIndex)->GetMethodName()
               << " yet available " << Endl;
      }
   }

   // boosting plots universal for all classifiers (typically for debug
   // purposes only, as they are not general enough)

   if (stage == Types::kBeforeBoosting){
      // if you want to display the weighted events for 2D case at each boost step:
      if (fDetailedMonitoring){
         // the following code is useful only for 2D examples - mainly illustration for debug/educational purposes:
         if (DataInfo().GetNVariables() == 2) {
            results->Store(new TH2F(Form("EventDistSig_%d",methodIndex),Form("EventDistSig_%d",methodIndex),100,0,7,100,0,7));
            results->GetHist(Form("EventDistSig_%d",methodIndex))->SetMarkerColor(4);
            results->Store(new TH2F(Form("EventDistBkg_%d",methodIndex),Form("EventDistBkg_%d",methodIndex),100,0,7,100,0,7));
            results->GetHist(Form("EventDistBkg_%d",methodIndex))->SetMarkerColor(2);

            Data()->SetCurrentType(Types::kTraining);
            for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
               const Event* ev = GetEvent(ievt);
               Float_t w = ev->GetWeight();
               Float_t v0= ev->GetValue(0);
               Float_t v1= ev->GetValue(1);
               // if (ievt<3) std::cout<<ievt<<" var0="<<v0<<" var1="<<v1<<" weight="<<w<<std::endl;
               TH2* h;
               if (DataInfo().IsSignal(ev)) h=results->GetHist2D(Form("EventDistSig_%d",methodIndex));
               else                         h=results->GetHist2D(Form("EventDistBkg_%d",methodIndex));
               if (h) h->Fill(v0,v1,w);
            }
         }
      }
   }

   return;
}