// Static flag set once, at static-initialization time, from the result of asking the
// ROOT-R interface singleton to Require the R package "C50".
// Init() reports an error when this flag is false.
47 Bool_t MethodC50::IsModuleLoaded = ROOT::R::TRInterface::Instance().Require("C50");
// Constructor taking a job name, a method title and an option string. These are forwarded
// to RMethodBase together with the Types::kC50 tag.
// NOTE(review): this listing is incomplete — `dsi` is passed to the base class but its
// parameter declaration is not visible, and parts of the initializer list and body are
// missing. Check against the complete file before changing anything here.
50 MethodC50::MethodC50(const TString &jobName,
51 const TString &methodTitle,
53 const TString &theOption) : RMethodBase(jobName, Types::kC50, methodTitle, dsi, theOption),
// Members initialized with the names of R functions; `predict` is later invoked like a
// function in GetMvaValue()/GetMvaValues() (presumably ROOT-R function wrappers — confirm).
57 predict("predict.C5.0"),
60 C50Control("C5.0Control"),
61 asfactor("as.factor"),
// Defaults for the values that ProcessOptions() passes to C50Control.
67 fControlSubset = kTRUE;
69 fControlWinnow = kFALSE;
70 fControlNoGlobalPruning = kFALSE;
73 fControlFuzzyThreshold = kFALSE;
// The default seed comes from R: the expression below (in R, a random integer in 0..4095)
// is streamed from the interface object `r` into fControlSeed.
75 r[
"sample.int(4096, size = 1) - 1L"] >> fControlSeed;
76 fControlEarlyStopping = kTRUE;
// Keep a copy of the variable list reported by DataInfo().
78 ListOfVariables = DataInfo().GetListOfVariables();
// Constructor taking a DataSetInfo and a weight-file name; both are forwarded to
// RMethodBase together with the Types::kC50 tag.
// NOTE(review): this listing is incomplete — parts of the initializer list and body are
// not visible. Check against the complete file before changing anything here.
82 MethodC50::MethodC50(DataSetInfo &theData,
const TString &theWeightFile)
83 : RMethodBase(Types::kC50, theData, theWeightFile),
// Members initialized with the names of R functions; `predict` is later invoked like a
// function in GetMvaValue()/GetMvaValues() (presumably ROOT-R function wrappers — confirm).
87 predict(
"predict.C5.0"),
89 C50Control(
"C5.0Control"),
90 asfactor(
"as.factor"),
// Defaults for the values that ProcessOptions() passes to C50Control.
95 fControlSubset = kTRUE;
97 fControlWinnow = kFALSE;
98 fControlNoGlobalPruning = kFALSE;
100 fControlMinCases = 2;
101 fControlFuzzyThreshold = kFALSE;
// The default seed comes from R: the expression below (in R, a random integer in 0..4095)
// is streamed from the interface object `r` into fControlSeed.
103 r[
"sample.int(4096, size = 1) - 1L"] >> fControlSeed;
104 fControlEarlyStopping = kTRUE;
109 MethodC50::~MethodC50(
void)
111 if (fModel)
delete fModel;
115 Bool_t MethodC50::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t )
117 if (type == Types::kClassification && numberClasses == 2)
return kTRUE;
123 void MethodC50::Init()
126 if (!IsModuleLoaded) {
127 Error(
"Init",
"R's package C50 can not be loaded.");
128 Log() << kFATAL <<
" R's package C50 can not be loaded."
134 void MethodC50::Train()
136 if (Data()->GetNTrainingEvents() == 0) Log() << kFATAL <<
"<Train> Data() has zero events" << Endl;
137 SEXP Model = C50(ROOT::R::Label[
"x"] = fDfTrain, \
138 ROOT::R::Label[
"y"] = asfactor(fFactorTrain), \
139 ROOT::R::Label[
"trials"] = fNTrials, \
140 ROOT::R::Label[
"rules"] = fRules, \
141 ROOT::R::Label[
"weights"] = fWeightTrain, \
142 ROOT::R::Label[
"control"] = fModelControl);
143 fModel =
new ROOT::R::TRObject(Model);
144 if (IsModelPersistence())
146 TString path = GetWeightFileDir() +
"/" + GetName() +
".RData";
148 Log() << gTools().Color(
"bold") <<
"--- Saving State File In:" << gTools().Color(
"reset") << path << Endl;
150 r[
"C50Model"] << Model;
151 r <<
"save(C50Model,file='" + path +
"')";
156 void MethodC50::DeclareOptions()
159 DeclareOptionRef(fNTrials,
"NTrials",
"An integer specifying the number of boosting iterations");
160 DeclareOptionRef(fRules,
"Rules",
"A logical: should the tree be decomposed into a rule-basedmodel?");
163 DeclareOptionRef(fControlSubset,
"ControlSubset",
"A logical: should the model evaluate groups of discrete \
164 predictors for splits? Note: the C5.0 command line version defaults this \
165 parameter to ‘FALSE’, meaning no attempted gropings will be evaluated \
166 during the tree growing stage.");
167 DeclareOptionRef(fControlBands,
"ControlBands",
"An integer between 2 and 1000. If ‘TRUE’, the model orders \
168 the rules by their affect on the error rate and groups the \
169 rules into the specified number of bands. This modifies the \
170 output so that the effect on the error rate can be seen for \
171 the groups of rules within a band. If this options is \
172 selected and ‘rules = kFALSE’, a warning is issued and ‘rules’ \
173 is changed to ‘kTRUE’.");
174 DeclareOptionRef(fControlWinnow,
"ControlWinnow",
"A logical: should predictor winnowing (i.e feature selection) be used?");
175 DeclareOptionRef(fControlNoGlobalPruning,
"ControlNoGlobalPruning",
"A logical to toggle whether the final, global pruning \
176 step to simplify the tree.");
177 DeclareOptionRef(fControlCF,
"ControlCF",
"A number in (0, 1) for the confidence factor.");
178 DeclareOptionRef(fControlMinCases,
"ControlMinCases",
"an integer for the smallest number of samples that must be \
179 put in at least two of the splits.");
181 DeclareOptionRef(fControlFuzzyThreshold,
"ControlFuzzyThreshold",
"A logical toggle to evaluate possible advanced splits \
182 of the data. See Quinlan (1993) for details and examples.");
183 DeclareOptionRef(fControlSample,
"ControlSample",
"A value between (0, .999) that specifies the random \
184 proportion of the data should be used to train the model. By \
185 default, all the samples are used for model training. Samples \
186 not used for training are used to evaluate the accuracy of \
187 the model in the printed output.");
188 DeclareOptionRef(fControlSeed,
"ControlSeed",
" An integer for the random number seed within the C code.");
189 DeclareOptionRef(fControlEarlyStopping,
"ControlEarlyStopping",
" A logical to toggle whether the internal method for \
190 stopping boosting should be used.");
196 void MethodC50::ProcessOptions()
199 Log() << kERROR <<
" fNTrials <=0... that does not work !! "
200 <<
" I set it to 1 .. just so that the program does not crash"
204 fModelControl = C50Control(ROOT::R::Label[
"subset"] = fControlSubset, \
205 ROOT::R::Label[
"bands"] = fControlBands, \
206 ROOT::R::Label[
"winnow"] = fControlWinnow, \
207 ROOT::R::Label[
"noGlobalPruning"] = fControlNoGlobalPruning, \
208 ROOT::R::Label[
"CF"] = fControlCF, \
209 ROOT::R::Label[
"minCases"] = fControlMinCases, \
210 ROOT::R::Label[
"fuzzyThreshold"] = fControlFuzzyThreshold, \
211 ROOT::R::Label[
"sample"] = fControlSample, \
212 ROOT::R::Label[
"seed"] = fControlSeed, \
213 ROOT::R::Label[
"earlyStopping"] = fControlEarlyStopping);
217 void MethodC50::TestClassification()
219 Log() << kINFO <<
"Testing Classification C50 METHOD " << Endl;
220 MethodBase::TestClassification();
225 Double_t MethodC50::GetMvaValue(Double_t *errLower, Double_t *errUpper)
227 NoErrorCalc(errLower, errUpper);
229 const TMVA::Event *ev = GetEvent();
230 const UInt_t nvar = DataInfo().GetNVariables();
231 ROOT::R::TRDataFrame fDfEvent;
232 for (UInt_t i = 0; i < nvar; i++) {
233 fDfEvent[DataInfo().GetListOfVariables()[i].Data()] = ev->GetValues()[i];
236 if (IsModelPersistence()) ReadStateFromFile();
238 TVectorD result = predict(*fModel, fDfEvent, ROOT::R::Label[
"type"] =
"prob");
239 mvaValue = result[1];
246 std::vector<Double_t> MethodC50::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
248 Long64_t nEvents = Data()->GetNEvents();
249 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
250 if (firstEvt < 0) firstEvt = 0;
252 nEvents = lastEvt-firstEvt;
254 UInt_t nvars = Data()->GetNVariables();
257 Timer timer( nEvents, GetName(), kTRUE );
259 Log() << kINFO<<Form(
"Dataset[%s] : ",DataInfo().GetName())<<
"Evaluation of " << GetMethodName() <<
" on "
260 << (Data()->GetCurrentType()==Types::kTraining?
"training":
"testing") <<
" sample (" << nEvents <<
" events)" << Endl;
264 std::vector<std::vector<Float_t> > inputData(nvars);
265 for (UInt_t i = 0; i < nvars; i++) {
266 inputData[i] = std::vector<Float_t>(nEvents);
269 for (Int_t ievt=firstEvt; ievt<lastEvt; ievt++) {
270 Data()->SetCurrentEvent(ievt);
271 const TMVA::Event *e = Data()->GetEvent();
272 assert(nvars == e->GetNVariables());
273 for (UInt_t i = 0; i < nvars; i++) {
274 inputData[i][ievt] = e->GetValue(i);
280 ROOT::R::TRDataFrame evtData;
281 for (UInt_t i = 0; i < nvars; i++) {
282 evtData[DataInfo().GetListOfVariables()[i].Data()] = inputData[i];
285 if (IsModelPersistence()) ReadModelFromFile();
287 std::vector<Double_t> mvaValues(nEvents);
288 ROOT::R::TRObject result = predict(*fModel, evtData, ROOT::R::Label[
"type"] =
"prob");
289 std::vector<Double_t> probValues(2*nEvents);
290 probValues = result.As<std::vector<Double_t>>();
291 assert(probValues.size() == 2*mvaValues.size());
292 std::copy(probValues.begin()+nEvents, probValues.end(), mvaValues.begin() );
295 Log() << kINFO <<Form(
"Dataset[%s] : ",DataInfo().GetName())<<
"Elapsed time for evaluation of " << nEvents <<
" events: "
296 << timer.GetElapsedTime() <<
" " << Endl;
304 void MethodC50::GetHelpMessage()
const
311 Log() << gTools().Color(
"bold") <<
"--- Short description:" << gTools().Color(
"reset") << Endl;
313 Log() <<
"Decision Trees and Rule-Based Models " << Endl;
315 Log() << gTools().Color(
"bold") <<
"--- Performance optimisation:" << gTools().Color(
"reset") << Endl;
318 Log() << gTools().Color(
"bold") <<
"--- Performance tuning via configuration options:" << gTools().Color(
"reset") << Endl;
320 Log() <<
"<None>" << Endl;
324 void TMVA::MethodC50::ReadModelFromFile()
326 ROOT::R::TRInterface::Instance().Require(
"C50");
327 TString path = GetWeightFileDir() +
"/" + GetName() +
".RData";
329 Log() << gTools().Color(
"bold") <<
"--- Loading State File From:" << gTools().Color(
"reset") << path << Endl;
331 r <<
"load('" + path +
"')";
333 r[
"C50Model"] >> Model;
334 fModel =
new ROOT::R::TRObject(Model);
339 void TMVA::MethodC50::MakeClass(
const TString &)
const