// Invokes the ClassImp macro for TMVA::RuleFitAPI. The macro is defined outside
// this excerpt (presumably the ROOT class-implementation hook - TODO confirm).
48 ClassImp(TMVA::RuleFitAPI);
// Constructor: stores the MethodRuleFit pointer in fMethodRuleFit and builds
// the logger with the source name RuleFitAPI and the message type minType.
//   rfbase  - method object; also queried for its RuleFit work directory
//   minType - message type forwarded to the logger (kINFO when omitted)
50 TMVA::RuleFitAPI::RuleFitAPI(
const MethodRuleFit *rfbase,
52 EMsgType minType = kINFO ) :
53 fMethodRuleFit(rfbase),
56 fLogger(
"RuleFitAPI",minType)
// Work directory taken from the method object.
60 SetRFWorkDir(rfbase->GetRFWorkDir());
// Alternative work directory ./rulefit. The condition that selects between the
// two SetRFWorkDir calls is not visible in this excerpt (presumably a null
// check on rfbase - TODO confirm).
62 SetRFWorkDir(
"./rulefit");
// Destructor. Its body is not visible in this excerpt.
71 TMVA::RuleFitAPI::~RuleFitAPI()
// Streams a framed banner stating that the interface to the RuleFit(tm) code
// of Jerome Friedman is running, followed by the URL of its manual.
// The stream object and message level receiving the text are on a line that is
// not visible in this excerpt.
78 void TMVA::RuleFitAPI::WelcomeMessage()
82 <<
"---------------------------------------------------------------------------\n"
83 <<
"- You are running the interface to Jerome Friedmans RuleFit(tm) code. -\n"
84 <<
"- For a full manual see the following web page: -\n"
86 <<
"- http://www-stat.stanford.edu/~jhf/R-RuleFit.html -\n"
88 <<
"---------------------------------------------------------------------------"
// Streams the setup instructions for the external RuleFit program:
//   1. create the work directory (fRFWorkDir is inserted into the mkdir line),
//   2. put rf_go.exe into that directory and make it executable.
// Download locations for the Linux and Windows executables are included.
// The stream object receiving the text is not visible in this excerpt.
94 void TMVA::RuleFitAPI::HowtoSetupRF()
98 <<
"------------------------ RULEFIT-JF INTERFACE SETUP -----------------------\n"
100 <<
"1. Create a rulefit directory in your current work directory:\n"
// The configured work directory name is spliced into the mkdir command line.
101 <<
" mkdir " << fRFWorkDir <<
"\n\n"
102 <<
" the directory may be set using the option RuleFitDir\n"
104 <<
"2. Copy (or make a link) the file rf_go.exe into this directory\n"
106 <<
"The file can be obtained from Jerome Friedmans homepage (linux):\n"
107 <<
" wget http://www-stat.stanford.edu/~jhf/r-rulefit/linux/rf_go.exe\n"
109 <<
"Don't forget to do:\n"
110 <<
" chmod +x rf_go.exe\n"
112 <<
"For Windows download:\n"
113 <<
" http://www-stat.stanford.edu/~jhf/r-rulefit/windows/rf_go.exe\n"
115 <<
"NOTE: other platforms are not supported (see Friedmans homepage)\n"
117 <<
"---------------------------------------------------------------------------\n"
// Initialisation entry point. Its body is not visible in this excerpt.
124 void TMVA::RuleFitAPI::InitRuleFit()
// Copies settings from the owning MethodRuleFit into the integer and real
// parameter structs (fRFIntParms / fRFRealParms), and derives the learning
// mode from the rule ensemble flags.
134 void TMVA::RuleFitAPI::ImportSetup()
// Integer parameters: p is the number of input variables.
136 fRFIntParms.p = fMethodRuleFit->DataInfo().GetNVariables();
137 fRFIntParms.max_rules = fMethodRuleFit->GetRFNrules();
138 fRFIntParms.tree_size = fMethodRuleFit->GetRFNendnodes();
139 fRFIntParms.path_steps = fMethodRuleFit->GetGDNPathSteps();
// Real parameters; these replace the defaults assigned in FillRealParmsDef.
141 fRFRealParms.path_inc = fMethodRuleFit->GetGDPathStep();
142 fRFRealParms.samp_fract = fMethodRuleFit->GetTreeEveFrac();
143 fRFRealParms.trim_qntl = fMethodRuleFit->GetLinQuantile();
144 fRFRealParms.conv_fac = fMethodRuleFit->GetGDErrScale();
// Learning mode: linear-only, rules-only, or both.
146 if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyLinear() )
147 fRFIntParms.lmode = kRfLinear;
148 else if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyRules() )
149 fRFIntParms.lmode = kRfRules;
// Remaining case (the else keyword itself is not visible in this excerpt).
151 fRFIntParms.lmode = kRfBoth;
// Sets the RuleFit work directory from wdir. The body is not visible in this
// excerpt, so how wdir is stored is not documented here.
157 void TMVA::RuleFitAPI::SetRFWorkDir(
const char * wdir)
// Checks that the RuleFit work directory is usable: the process must be able
// to cd into fRFWorkDir, and rf_go.exe is opened for reading from there.
// Each failure is reported as a kWARNING followed by a kFATAL message.
// The working directory recorded at entry is restored at the end.
168 void TMVA::RuleFitAPI::CheckRFWorkDir()
// Remember where we started so it can be restored below.
170 TString oldDir = gSystem->pwd();
171 if (!gSystem->cd(fRFWorkDir)) {
172 fLogger << kWARNING <<
"Must create a rulefit directory named : " << fRFWorkDir << Endl;
174 fLogger << kFATAL <<
"Setup failed - aborting!" << Endl;
// Probe for the executable relative to the work directory entered above.
// The test applied to f is on a line not visible in this excerpt.
// NOTE(review): no fclose of f is visible here - confirm it exists on a line
// that is not shown.
177 FILE *f = fopen(
"rf_go.exe",
"r");
179 fLogger << kWARNING <<
"No rf_go.exe file in directory : " << fRFWorkDir << Endl;
181 fLogger << kFATAL <<
"Setup failed - aborting!" << Endl;
// Return to the directory that was current on entry.
184 gSystem->cd(oldDir.Data());
// Prepares a training run: reads the number of training events into n and
// selects the kRfTrain program. How n is used is on lines not visible in this
// excerpt.
190 void TMVA::RuleFitAPI::SetTrainParms()
194 Int_t n = fMethodRuleFit->Data()->GetNTrainingEvents();
197 fRFProgram = kRfTrain;
// Prepares a prediction run: reads the number of test events into n and
// selects the kRfPredict program. How n is used is on lines not visible in
// this excerpt.
203 void TMVA::RuleFitAPI::SetTestParms()
206 Int_t n = fMethodRuleFit->Data()->GetNTestEvents();
209 fRFProgram = kRfPredict;
// Assigns default values to the real-valued parameter struct fRFRealParms.
// ImportSetup later overwrites path_inc, samp_fract, trim_qntl and conv_fac
// with values from the method object.
215 void TMVA::RuleFitAPI::FillRealParmsDef()
217 fRFRealParms.xmiss = 9.0e30;
218 fRFRealParms.trim_qntl = 0.025;
219 fRFRealParms.huber = 0.8;
220 fRFRealParms.inter_supp = 3.0;
221 fRFRealParms.memory_par = 0.01;
222 fRFRealParms.samp_fract = 0.5;
223 fRFRealParms.path_inc = 0.01;
224 fRFRealParms.conv_fac = 1.1;
// Assigns default values to the integer parameter struct fRFIntParms.
// ImportSetup later overwrites max_rules, tree_size, path_steps and lmode.
230 void TMVA::RuleFitAPI::FillIntParmsDef()
// Mode enumerators are stored as plain int values.
232 fRFIntParms.mode = (int)kRfClass;
233 fRFIntParms.lmode = (int)kRfBoth;
236 fRFIntParms.max_rules = 2000;
237 fRFIntParms.tree_size = 4;
238 fRFIntParms.path_speed = 2;
239 fRFIntParms.path_xval = 3;
240 fRFIntParms.path_steps = 50000;
241 fRFIntParms.path_testfreq = 100;
242 fRFIntParms.tree_store = 10000000;
243 fRFIntParms.cat_store = 1000000;
// Writes the input files for the selected program. The visible part dispatches
// on fRFProgram to the matching writer; writes performed before this dispatch
// and the return value are on lines not visible in this excerpt.
250 Bool_t TMVA::RuleFitAPI::WriteAll()
257 if (fRFProgram==kRfTrain) WriteTrain();
258 if (fRFProgram==kRfPredict) WriteTest();
259 if (fRFProgram==kRfVarimp) WriteRealVarImp();
// Writes the integer parameter struct to the file named intparms.
// Returns kFALSE when OpenRFile fails; the success return is not visible here.
266 Bool_t TMVA::RuleFitAPI::WriteIntParms()
269 if (!OpenRFile(
"intparms",f))
return kFALSE;
// The struct is written as a run of Int_t values starting at its mode member;
// the count is the struct size divided by sizeof(Int_t).
// NOTE(review): this assumes the struct holds only Int_t members with mode
// first - confirm against the struct declaration.
270 WriteInt(f,&fRFIntParms.mode,
sizeof(fRFIntParms)/
sizeof(Int_t));
// Writes the real parameter struct to the file named realparms.
// Returns kFALSE when OpenRFile fails; the success return is not visible here.
277 Bool_t TMVA::RuleFitAPI::WriteRealParms()
280 if (!OpenRFile(
"realparms",f))
return kFALSE;
// The struct is written as a run of Float_t values starting at its xmiss
// member; the count is the struct size divided by sizeof(Float_t).
// NOTE(review): this assumes the struct holds only Float_t members with xmiss
// first - confirm against the struct declaration.
281 WriteFloat(f,&fRFRealParms.xmiss,
sizeof(fRFRealParms)/
sizeof(Float_t));
// Writes the fRFLx array to the file named lx.
// Returns kFALSE when OpenRFile fails; the success return is not visible here.
293 Bool_t TMVA::RuleFitAPI::WriteLx()
// One entry per input variable; the fill value passed to resize is 1.
296 fRFLx.resize(fMethodRuleFit->DataInfo().GetNVariables(),1);
299 if (!OpenRFile(
"lx",f))
return kFALSE;
300 WriteInt(f,&fRFLx[0],fRFLx.size());
// Writes the name of the program to run to the file named program.
// Returns kFALSE when OpenRFile fails. Only fragments of the switch are
// visible in this excerpt: the case labels, the other program names and the
// write itself are on lines that are not shown.
307 Bool_t TMVA::RuleFitAPI::WriteProgram()
310 if (!OpenRFile(
"program",f))
return kFALSE;
312 switch (fRFProgram) {
// One of the branches selects the prediction program name.
317 program =
"rulefit_pred";
// One branch resets fRFProgram to kRfTrain (presumably the default branch for
// an unrecognised value - TODO confirm).
324 fRFProgram = kRfTrain;
// Writes two floats taken from rvp to the file named realvarimp.
// Returns kFALSE when OpenRFile fails. The declaration and contents of rvp are
// on lines not visible in this excerpt.
335 Bool_t TMVA::RuleFitAPI::WriteRealVarImp()
338 if (!OpenRFile(
"realvarimp",f))
return kFALSE;
342 WriteFloat(f,&rvp[0],2);
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
349 Bool_t TMVA::RuleFitAPI::WriteRfOut()
351 fLogger << kWARNING <<
"WriteRfOut is not yet implemented" << Endl;
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
358 Bool_t TMVA::RuleFitAPI::WriteRfStatus()
360 fLogger << kWARNING <<
"WriteRfStatus is not yet implemented" << Endl;
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
367 Bool_t TMVA::RuleFitAPI::WriteRuleFitMod()
369 fLogger << kWARNING <<
"WriteRuleFitMod is not yet implemented" << Endl;
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
376 Bool_t TMVA::RuleFitAPI::WriteRuleFitSum()
378 fLogger << kWARNING <<
"WriteRuleFitSum is not yet implemented" << Endl;
// Writes the training sample for the external program into three files:
// train.x, train.y and train.w. Returns kFALSE as soon as one of them cannot
// be opened. The calls that actually write x, y and the weight are on lines
// not visible in this excerpt.
385 Bool_t TMVA::RuleFitAPI::WriteTrain()
391 if (!OpenRFile(
"train.x",fx))
return kFALSE;
392 if (!OpenRFile(
"train.y",fy))
return kFALSE;
393 if (!OpenRFile(
"train.w",fw))
return kFALSE;
// Variable-major traversal: the outer loop runs over variables, the inner loop
// over all training events.
400 for (UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
401 for (Int_t ievt=0;ievt<fMethodRuleFit->Data()->GetNTrainingEvents(); ievt++) {
402 const Event * ev = fMethodRuleFit->GetTrainingEvent(ievt);
403 x = ev->GetValue(ivar);
// Target encoding: +1 for signal events, -1 for everything else.
407 y = fMethodRuleFit->DataInfo().IsSignal(ev)? 1.0 : -1.0;
413 fLogger << kINFO <<
"Number of training data written: " << fMethodRuleFit->Data()->GetNTrainingEvents() << Endl;
// Writes the test sample to the file named test.x. Returns kFALSE when the
// file cannot be opened. The call that writes each value vf is on a line not
// visible in this excerpt.
420 Bool_t TMVA::RuleFitAPI::WriteTest()
// Switch the data set to the testing sample so GetNEvents / GetEvent below
// refer to test events.
422 fMethodRuleFit->Data()->SetCurrentType(Types::kTesting);
426 if (!OpenRFile(
"test.x",f))
return kFALSE;
// The file starts with the event count, stored as a float.
431 neve =
static_cast<Float_t
>(fMethodRuleFit->Data()->GetNEvents());
432 WriteFloat(f,&neve,1);
// Variable-major traversal: the outer loop runs over variables, the inner loop
// over all events of the current (testing) sample.
438 for (UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
439 for (Int_t ievt=0;ievt<fMethodRuleFit->Data()->GetNEvents(); ievt++) {
440 vf = fMethodRuleFit->GetEvent(ievt)->GetValue(ivar);
444 fLogger << kINFO <<
"Number of test data written: " << fMethodRuleFit->Data()->GetNEvents() << Endl;
// Writes the expression of every input variable, one per line, to the file
// named varnames. Returns kFALSE when the file cannot be opened; the success
// return is not visible in this excerpt.
452 Bool_t TMVA::RuleFitAPI::WriteVarNames()
455 if (!OpenRFile(
"varnames",f))
return kFALSE;
456 for (UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
457 f << fMethodRuleFit->DataInfo().GetVariableInfo(ivar).GetExpression() <<
'\n';
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
464 Bool_t TMVA::RuleFitAPI::WriteVarImp()
468 fLogger << kWARNING <<
"WriteVarImp is not yet implemented" << Endl;
// Placeholder: only logs a warning that the function is not implemented.
// The value returned is not visible in this excerpt.
475 Bool_t TMVA::RuleFitAPI::WriteYhat()
477 fLogger << kWARNING <<
"WriteYhat is not yet implemented" << Endl;
// Reads the file named yhat into fRFYhat, one float per test event.
// Returns kFALSE when the file cannot be opened. What happens after the size
// mismatch warnings (for example an early return) is not visible in this
// excerpt.
484 Bool_t TMVA::RuleFitAPI::ReadYhat()
489 if (!OpenRFile(
"yhat",f))
return kFALSE;
// The first float in the file is the event count.
492 ReadFloat(f,&xval,1);
493 neve =
static_cast<Int_t
>(xval);
// The count in the file is compared with the number of test events.
494 if (neve!=fMethodRuleFit->Data()->GetNTestEvents()) {
495 fLogger << kWARNING <<
"Inconsistent size of yhat file and test tree!" << Endl;
496 fLogger << kWARNING <<
"neve = " << neve <<
" , tree = " << fMethodRuleFit->Data()->GetNTestEvents() << Endl;
// Read one value per test event and append it to fRFYhat.
499 for (Int_t ievt=0; ievt<fMethodRuleFit->Data()->GetNTestEvents(); ievt++) {
500 ReadFloat(f,&xval,1);
501 fRFYhat.push_back(xval);
// Reads the file named varimp: first one importance value per variable, then
// one index per variable. Importances are stored in fRFVarImp normalised by
// the largest value; indices are stored in fRFVarImpInd.
// Returns kFALSE when the file cannot be opened.
509 Bool_t TMVA::RuleFitAPI::ReadVarImp()
514 if (!OpenRFile(
"varimp",f))
return kFALSE;
518 nvars=fMethodRuleFit->DataInfo().GetNVariables();
// First pass: collect the raw importances and track the maximum in xmax.
522 for (UInt_t ivar=0; ivar<nvars; ivar++) {
523 ReadFloat(f,&xval,1);
527 if (xval>xmax) xmax=xval;
529 fRFVarImp.push_back(xval);
// Second pass: normalise by xmax and read the indices, which are stored as
// floats in the file and converted to zero-based integers here.
// NOTE(review): the initial value of xmax is not visible in this excerpt -
// confirm the division below cannot be by zero.
535 for (UInt_t ivar=0; ivar<nvars; ivar++) {
536 fRFVarImp[ivar] = fRFVarImp[ivar]/xmax;
537 ReadFloat(f,&xval,1);
538 fRFVarImpInd.push_back(Int_t(xval)-1);
// Reads the model summary file rulefit.sum and rebuilds the rule ensemble from
// it: the offset, every rule with its support, coefficient and cuts, and the
// linear terms. Returns kFALSE when the file cannot be opened; other return
// paths are on lines not visible in this excerpt.
// The variable lines accumulates the return values of ReadInt / ReadFloat.
546 Bool_t TMVA::RuleFitAPI::ReadModelSum()
550 fLogger << kVERBOSE <<
"Reading RuleFit summary file" << Endl;
552 if (!OpenRFile(
"rulefit.sum",f))
return kFALSE;
// Running maximum of the importances; starts below any value compared to it.
560 Double_t impref=-1.0;
// Average rule sigma is set to the constant 0.4 (rationale not visible here).
563 fRuleFit->GetRuleEnsemblePtr()->SetAverageRuleSigma(0.4);
// Header, part 1: four integers. The pattern 1,1,1,0 or a rule count of 0 is
// treated as a file without rules, in which case nrules is forced to 0.
591 lines += ReadInt(f,&nrules);
592 norules = (nrules==1);
593 lines += ReadInt(f,&dumI);
594 norules = norules && (dumI==1);
595 lines += ReadInt(f,&dumI);
596 norules = norules && (dumI==1);
597 lines += ReadInt(f,&dumI);
598 norules = norules && (dumI==0);
599 if (nrules==0) norules=kTRUE;
600 if (norules) nrules = 0;
// Header, part 2: two variable counts, a float that is logged below as xmiss,
// and the model offset.
602 lines += ReadInt(f,&nvars);
603 lines += ReadInt(f,&nvarsOpt);
604 lines += ReadFloat(f,&dumF);
605 lines += ReadFloat(f,&offset);
606 fLogger << kDEBUG <<
"N(rules) = " << nrules << Endl;
607 fLogger << kDEBUG <<
"N(vars) = " << nvars << Endl;
608 fLogger << kDEBUG <<
"N(varsO) = " << nvarsOpt << Endl;
609 fLogger << kDEBUG <<
"xmiss = " << dumF << Endl;
610 fLogger << kDEBUG <<
"offset = " << offset << Endl;
// A mismatch between the two variable counts only produces a warning.
611 if (nvars!=nvarsOpt) {
612 fLogger << kWARNING <<
"Format of rulefit.sum is ... weird?? Continuing but who knows how it will end...?" << Endl;
// Per-rule data gathered in the first pass and consumed in the second.
614 std::vector<Double_t> rfSupp;
615 std::vector<Double_t> rfCoef;
616 std::vector<Int_t> rfNcut;
617 std::vector<Rule *> rfRules;
// Eight floats are read into dumF and not stored by the visible lines; their
// meaning is not visible in this excerpt.
621 for (Int_t t=0; t<8; t++) {
622 lines += ReadFloat(f,&dumF);
// Pass 1: five floats per rule. The 2nd is kept as support, the 3rd as
// coefficient, the 4th (rounded to nearest) as the number of cuts; the 1st and
// 5th are not stored by the visible lines.
636 for (Int_t r=0; r<nrules; r++) {
637 lines += ReadFloat(f,&dumF);
638 lines += ReadFloat(f,&dumF);
639 rfSupp.push_back(dumF);
640 lines += ReadFloat(f,&dumF);
641 rfCoef.push_back(dumF);
642 lines += ReadFloat(f,&dumF);
643 rfNcut.push_back(static_cast<int>(dumF+0.5));
644 lines += ReadFloat(f,&dumF);
// Pass 2: build one Rule with one RuleCut per rule and read its cuts.
// NOTE(review): both objects are created with new; who deletes them is not
// visible in this excerpt (presumably the ensemble and the rule - confirm).
657 for (Int_t r=0; r<nrules; r++) {
661 Rule *rule =
new Rule(fRuleFit->GetRuleEnsemblePtr());
662 rfRules.push_back( rule );
663 RuleCut *rfcut =
new RuleCut();
664 rfcut->SetNvars(rfNcut[r]);
665 rule->SetRuleCut( rfcut );
// Fixed bookkeeping values; lines between these calls are not visible here.
671 rule->SetSSBNeve(0.0);
672 rule->SetImportanceRef(1.0);
674 rule->SetSSBNeve(0.0);
// Apply the values from pass 1 and track the largest rule importance.
676 rule->SetSupport(rfSupp[r]);
677 rule->SetCoefficient(rfCoef[r]);
678 rule->CalcImportance();
679 imp = rule->GetImportance();
680 if (imp>impref) impref = imp;
// Note: the value printed after the rule number is nvars.
682 fLogger << kDEBUG <<
"Rule #" << r <<
" : " << nvars << Endl;
683 fLogger << kDEBUG <<
" support = " << rfSupp[r] << Endl;
684 fLogger << kDEBUG <<
" sigma = " << rule->GetSigma() << Endl;
685 fLogger << kDEBUG <<
" coeff = " << rfCoef[r] << Endl;
686 fLogger << kDEBUG <<
" N(cut) = " << rfNcut[r] << Endl;
// Each cut is three floats: a one-based variable index (converted to
// zero-based), the lower bound and the upper bound.
688 for (Int_t c=0; c<rfNcut[r]; c++) {
689 lines += ReadFloat(f,&dumF);
690 varind =
static_cast<Int_t
>(dumF+0.5)-1;
691 lines += ReadFloat(f,&dumF);
692 xmin =
static_cast<Double_t
>(dumF);
693 lines += ReadFloat(f,&dumF);
694 xmax =
static_cast<Double_t
>(dumF);
696 rfcut->SetSelector(c,varind);
697 rfcut->SetCutMin(c,xmin);
698 rfcut->SetCutMax(c,xmax);
// Bounds beyond -8.99e35 / +8.99e35 switch the corresponding side of the cut
// off (the DoMin / DoMax flag is set to kFALSE).
701 rfcut->SetCutDoMin(c,(xmin<-8.99e35 ? kFALSE:kTRUE));
702 rfcut->SetCutDoMax(c,(xmax> 8.99e35 ? kFALSE:kTRUE));
// Hand the rebuilt rules and the offset to the ensemble.
706 fRuleFit->GetRuleEnsemblePtr()->SetRules( rfRules );
707 fRuleFit->GetRuleEnsemblePtr()->SetOffset( offset );
// Linear terms: one record of six floats per variable, collected in these
// vectors. They reuse the names varind / xmin / xmax that are assigned as
// scalars in the cut loop above; the scoping that separates the two uses is
// not visible in this excerpt.
720 std::vector<Int_t> varind;
721 std::vector<Double_t> xmin;
722 std::vector<Double_t> xmax;
723 std::vector<Double_t> average;
724 std::vector<Double_t> stdev;
725 std::vector<Double_t> norm;
726 std::vector<Double_t> coeff;
728 for (Int_t c=0; c<nvars; c++) {
// One-based variable index in the file, stored zero-based.
729 lines += ReadFloat(f,&dumF);
730 varind.push_back(static_cast<Int_t>(dumF+0.5)-1);
731 lines += ReadFloat(f,&dumF);
732 xmin.push_back(static_cast<Double_t>(dumF));
733 lines += ReadFloat(f,&dumF);
734 xmax.push_back(static_cast<Double_t>(dumF));
735 lines += ReadFloat(f,&dumF);
736 average.push_back(static_cast<Double_t>(dumF));
737 lines += ReadFloat(f,&dumF);
738 stdev.push_back(static_cast<Double_t>(dumF));
// The coefficient read from the file is divided by the normalisation that the
// ensemble computes from the standard deviation.
739 Double_t nv = fRuleFit->GetRuleEnsemblePtr()->CalcLinNorm(stdev.back());
741 lines += ReadFloat(f,&dumF);
742 coeff.push_back(dumF/nv);
744 fLogger << kDEBUG <<
"Linear #" << c << Endl;
745 fLogger << kDEBUG <<
" varind = " << varind.back() << Endl;
746 fLogger << kDEBUG <<
" xmin = " << xmin.back() << Endl;
747 fLogger << kDEBUG <<
" xmax = " << xmax.back() << Endl;
748 fLogger << kDEBUG <<
" average = " << average.back() << Endl;
749 fLogger << kDEBUG <<
" stdev = " << stdev.back() << Endl;
750 fLogger << kDEBUG <<
" coeff = " << coeff.back() << Endl;
// Hand the linear model to the ensemble.
// NOTE(review): no visible line appends to norm before it is passed to
// SetLinNorm - confirm it is filled on a line not shown in this excerpt.
753 fRuleFit->GetRuleEnsemblePtr()->SetLinCoefficients(coeff);
754 fRuleFit->GetRuleEnsemblePtr()->SetLinDM(xmin);
755 fRuleFit->GetRuleEnsemblePtr()->SetLinDP(xmax);
756 fRuleFit->GetRuleEnsemblePtr()->SetLinNorm(norm);
// The importance reference is the maximum over rule and linear importances.
759 imp = fRuleFit->GetRuleEnsemblePtr()->CalcLinImportance();
760 if (imp>impref) impref=imp;
761 fRuleFit->GetRuleEnsemblePtr()->SetImportanceRef(impref);
762 fRuleFit->GetRuleEnsemblePtr()->CleanupLinear();
764 fRuleFit->GetRuleEnsemblePtr()->CalcVarImportance();
767 fLogger << kDEBUG <<
"Reading model done" << Endl;
// Runs the external executable ./rf_go.exe from inside the RuleFit work
// directory and then restores the previous working directory. The exit status
// of the command is captured in rval; the return statement is past the end of
// this excerpt (presumably rval is returned - TODO confirm).
774 Int_t TMVA::RuleFitAPI::RunRuleFit()
776 TString oldDir = gSystem->pwd();
777 TString cmd =
"./rf_go.exe";
// The command is a relative path, so it must be started from the work
// directory.
778 gSystem->cd(fRFWorkDir.Data());
779 int rval = gSystem->Exec(cmd.Data());
780 gSystem->cd(oldDir.Data());