ClassImp(TMVA::MethodDT);
////////////////////////////////////////////////////////////////////////////////
/// The standard constructor for an ordinary "decision tree".

TMVA::MethodDT::MethodDT( const TString& jobName,
                          const TString& methodTitle,
                          DataSetInfo& theData,
                          const TString& theOption) :
   TMVA::MethodBase( jobName, Types::kDT, methodTitle, theData, theOption)
   , fUseYesNoLeaf(kFALSE)
   , fNodePurityLimit(0)
   , fPruneMethod(DecisionTree::kNoPruning)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fDeltaPruneStrength(0)
{
   fPruneBeforeBoost = kFALSE;
}
////////////////////////////////////////////////////////////////////////////////
/// Constructor for the reading of a method from a weight file.

TMVA::MethodDT::MethodDT( DataSetInfo& dsi,
                          const TString& theWeightFile) :
   TMVA::MethodBase( Types::kDT, dsi, theWeightFile)
   , fUseYesNoLeaf(kFALSE)
   , fNodePurityLimit(0)
   , fPruneMethod(DecisionTree::kNoPruning)
   , fRandomisedTrees(kFALSE)
   , fDeltaPruneStrength(0)
{
   fPruneBeforeBoost = kFALSE;
}
////////////////////////////////////////////////////////////////////////////////
/// DT can handle classification with two classes (and nothing else so far).

Bool_t TMVA::MethodDT::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ )
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}
////////////////////////////////////////////////////////////////////////////////
/// Define the options (their key words) that can be set in the option string.

void TMVA::MethodDT::DeclareOptions()
{
   DeclareOptionRef(fRandomisedTrees, "UseRandomisedTrees",
                    "Choose at each node splitting a random set of variables and *bagging*");
   DeclareOptionRef(fUseNvars, "UseNvars",
                    "Number of variables used if the randomised tree option is chosen");
   DeclareOptionRef(fUsePoissonNvars, "UsePoissonNvars",
                    "Interpret \"UseNvars\" not as a fixed number but as the mean of a Poisson distribution in each split with the RandomisedTree option");
   DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
                    "Use Sig or Bkg node type or the ratio S/B as classification in the leaf node");
   DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit",
                    "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise");
   DeclareOptionRef(fSepTypeS="GiniIndex", "SeparationType",
                    "Separation criterion for node splitting");
   AddPreDefVal(TString("MisClassificationError"));
   AddPreDefVal(TString("GiniIndex"));
   AddPreDefVal(TString("CrossEntropy"));
   AddPreDefVal(TString("SDivSqrtSPlusB"));
   DeclareOptionRef(fMinNodeEvents=-1, "nEventsMin",
                    "DEPRECATED: Minimum number of events required in a leaf node");
   DeclareOptionRef(fMinNodeSizeS, "MinNodeSize",
                    "Minimum percentage of training events required in a leaf node (default: Classification: 10%, Regression: 1%)");
   DeclareOptionRef(fNCuts, "nCuts",
                    "Number of steps during node cut optimisation");
   DeclareOptionRef(fPruneStrength, "PruneStrength",
                    "Pruning strength (negative value == automatic adjustment)");
   DeclareOptionRef(fPruneMethodS="NoPruning", "PruneMethod",
                    "Pruning method: NoPruning (switched off), ExpectedError or CostComplexity");
   AddPreDefVal(TString("NoPruning"));
   AddPreDefVal(TString("ExpectedError"));
   AddPreDefVal(TString("CostComplexity"));

   if (DoRegression()) {
      DeclareOptionRef(fMaxDepth=50, "MaxDepth", "Max depth of the decision tree allowed");
   }
   else {
      DeclareOptionRef(fMaxDepth=3, "MaxDepth", "Max depth of the decision tree allowed");
   }
}
////////////////////////////////////////////////////////////////////////////////
/// Options that are used ONLY for the READER, to ensure backward compatibility.

void TMVA::MethodDT::DeclareCompatibilityOptions() {
   MethodBase::DeclareCompatibilityOptions();
   DeclareOptionRef(fPruneBeforeBoost=kFALSE, "PruneBeforeBoost",
                    "--> removed option, only kept for reader backward compatibility");
}
////////////////////////////////////////////////////////////////////////////////
/// The option string is decoded; for available options see DeclareOptions().

void TMVA::MethodDT::ProcessOptions()
{
   fSepTypeS.ToLower();
   if      (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
   else if (fSepTypeS == "giniindex")              fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")           fSepType = new CrossEntropy();
   else if (fSepTypeS == "sdivsqrtsplusb")         fSepType = new SdivSqrtSplusB();
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Separation Index option called" << Endl;
   }

   fPruneMethodS.ToLower();
   if      (fPruneMethodS == "expectederror")  fPruneMethod = DecisionTree::kExpectedErrorPruning;
   else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
   else if (fPruneMethodS == "nopruning")      fPruneMethod = DecisionTree::kNoPruning;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown PruneMethod option: " << fPruneMethodS << " called" << Endl;
   }

   if (fPruneStrength < 0) fAutomatic = kTRUE;
   else fAutomatic = kFALSE;
   if (fAutomatic && fPruneMethod == DecisionTree::kExpectedErrorPruning) {
      Log() << kFATAL
            << "Sorry, automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
   }
   if (this->Data()->HasNegativeEventWeights()) {
      Log() << kINFO << " You are using a Monte Carlo sample that also has negative weights. "
            << "That should in principle be fine, as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="
            << fMinNodeSizeS
            << ", or the deprecated equivalent nEventsMin; you can set this via the "
            << "MethodDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging! "
            << "If this does not help, you may want to try the option IgnoreNegWeightsInTraining, "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   }
   if (fRandomisedTrees) {
      Log() << kINFO << " Randomised trees should use *bagging* as *boost* method. Did you set this in the *MethodBoost*? Here I can only enforce *no pruning*." << Endl;
      fPruneMethod = DecisionTree::kNoPruning;
   }
   if (fMinNodeEvents > 0) {
      // convert the deprecated absolute event count into a percentage of the
      // training sample (the 100.0 factor comes first to avoid integer division)
      fMinNodeSize = 100.0 * fMinNodeEvents / Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have explicitly set *nEventsMin*, the minimal absolute number \n"
            << "of events in a leaf node. This is DEPRECATED, please use the option \n"
            << "*MinNodeSize* giving the relative number as percentage of training \n"
            << "events instead. \n"
            << "nEventsMin=" << fMinNodeEvents << " --> MinNodeSize=" << fMinNodeSize << "%"
            << Endl;
   }
   else SetMinNodeSize(fMinNodeSizeS);
}
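// Worked example of the conversion above: with nEventsMin=100 and 10000
// training events, MinNodeSize becomes 100.0 * 100 / 10000 = 1%.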
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::SetMinNodeSize(Double_t sizeInPercent) {
   if (sizeInPercent > 0 && sizeInPercent < 50) {
      fMinNodeSize = sizeInPercent;
   }
   else {
      Log() << kERROR << "you have demanded a minimal node size of "
            << sizeInPercent << "% of the training events, "
            << "which does not make sense" << Endl;
   }
}
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::SetMinNodeSize(TString sizeInPercent) {
   sizeInPercent.ReplaceAll("%", "");
   // IsFloat() (rather than IsAlnum()) also accepts decimal values such as "2.5"
   if (sizeInPercent.IsFloat()) SetMinNodeSize(sizeInPercent.Atof());
   else {
      Log() << kERROR << "I had problems reading the option MinNodeSize, which "
            << "after removing a possible % sign now reads " << sizeInPercent << Endl;
   }
}
////////////////////////////////////////////////////////////////////////////////
/// Common initialisation with defaults for the DT method.

void TMVA::MethodDT::Init( void )
{
   fMinNodeSizeS       = "5%";
   fPruneMethod        = DecisionTree::kNoPruning;
   fDeltaPruneStrength = 0.1;
   fRandomisedTrees    = kFALSE;
   fUseNvars           = GetNvar();
   fUsePoissonNvars    = kTRUE;

   // reference cut value to distinguish signal-like from background-like events
   SetSignalReferenceCut( 0 );
   if (fAnalysisType == Types::kClassification || fAnalysisType == Types::kMulticlass) {
      fMaxDepth = 3;
   }
   else {
      fMaxDepth = 50;
   }
}
////////////////////////////////////////////////////////////////////////////////
/// Destructor.

TMVA::MethodDT::~MethodDT( void )
{
   delete fTree;
}
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::Train( void )
{
   TMVA::DecisionTreeNode::fgIsTraining = true;
   fTree = new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), 0,
                             fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth, 0 );
   fTree->SetNVars(GetNvar());
   if (fRandomisedTrees) Log() << kWARNING << " randomised trees do not work yet in this framework,"
                               << " as I do not know how to give each tree a new random seed: they"
                               << " will all be the same, and that is not good " << Endl;
   fTree->SetAnalysisType( GetAnalysisType() );

   // collect the training events and build the tree from them
   Data()->SetCurrentType(Types::kTraining);
   UInt_t nevents = Data()->GetNTrainingEvents();
   std::vector<const TMVA::Event*> tmp;
   for (UInt_t ievt=0; ievt<nevents; ievt++) {
      const Event *event = GetEvent(ievt);
      tmp.push_back(event);
   }
   fTree->BuildTree(tmp);
   if (fPruneMethod != DecisionTree::kNoPruning) fTree->PruneTree();

   TMVA::DecisionTreeNode::fgIsTraining = false;
}
////////////////////////////////////////////////////////////////////////////////
/// Prune the decision tree if requested; returns the pruning strength used.

Double_t TMVA::MethodDT::PruneTree( )
{
   if (fAutomatic && fPruneMethod == DecisionTree::kCostComplexityPruning) {
      // let the CCPruner find the optimal prune sequence and strength
      CCPruner* pruneTool = new CCPruner(fTree, this->Data(), fSepType);
      pruneTool->Optimize();
      std::vector<DecisionTreeNode*> nodes = pruneTool->GetOptimalPruneSequence();
      fPruneStrength = pruneTool->GetOptimalPruneStrength();
      for (UInt_t i = 0; i < nodes.size(); i++)
         fTree->PruneNode(nodes[i]);
      delete pruneTool;
   }
   else if (fAutomatic && fPruneMethod != DecisionTree::kCostComplexityPruning) {
      // automatic strength determination for the other pruning methods:
      // scan candidate prune strengths and keep the one that performs best
      // according to TestTreeQuality() on the validation sample
      // ...
   }
   else {
      fTree->SetPruneStrength(fPruneStrength);
      fTree->PruneTree();
   }

   return fPruneStrength;
}
////////////////////////////////////////////////////////////////////////////////
/// Return the weighted fraction of correctly classified events, evaluated on
/// the validation sample.

Double_t TMVA::MethodDT::TestTreeQuality( DecisionTree *dt )
{
   Data()->SetCurrentType(Types::kValidation);
   // test the tree quality in terms of misclassification
   Double_t SumCorrect = 0, SumWrong = 0;
   for (Long64_t ievt=0; ievt<Data()->GetNEvents(); ievt++)
   {
      const Event * ev = Data()->GetEvent(ievt);
      if ((dt->CheckEvent(ev) > dt->GetNodePurityLimit()) == DataInfo().IsSignal(ev)) SumCorrect += ev->GetWeight();
      else SumWrong += ev->GetWeight();
   }
   Data()->SetCurrentType(Types::kTraining);
   return SumCorrect / (SumCorrect + SumWrong);
}
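// The quality returned above is the weighted accuracy
//    Q = sum_{correct} w_i / ( sum_{correct} w_i + sum_{wrong} w_i ),
// where an event counts as "correct" if the tree response, compared against
// the node purity limit, agrees with the event's true class.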
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::AddWeightsXMLTo( void* parent ) const
{
   fTree->AddXMLTo(parent);
}
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::ReadWeightsFromXML( void* wghtnode )
{
   if (fTree) delete fTree;   // avoid leaking a previously read tree
   fTree = new DecisionTree();
   fTree->ReadXML(wghtnode, GetTrainingTMVAVersionCode());
}
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::ReadWeightsFromStream( std::istream& istr )
{
   delete fTree;
   fTree = new DecisionTree();
   fTree->Read(istr);
}
////////////////////////////////////////////////////////////////////////////////
/// Returns the MVA value of the current event.

Double_t TMVA::MethodDT::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   // cannot determine error
   NoErrorCalc(err, errUpper);

   return fTree->CheckEvent(GetEvent(), fUseYesNoLeaf);
}
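// Usage sketch (illustrative, not part of this class): after training, the
// response is typically evaluated through the standard TMVA::Reader interface;
// the variable name and weight-file path below are placeholders.
//
//    TMVA::Reader reader;
//    Float_t var1 = 0;
//    reader.AddVariable("var1", &var1);
//    reader.BookMVA("DT", "dataset/weights/TMVAClassification_DT.weights.xml");
//    Double_t mva = reader.EvaluateMVA("DT");   // dispatches to GetMvaValue()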
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodDT::GetHelpMessage() const
{
}
////////////////////////////////////////////////////////////////////////////////

const TMVA::Ranking* TMVA::MethodDT::CreateRanking()
{
   return 0;
}