#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>
using namespace TMVA;

namespace TMVA {
namespace Internal {
// RAII wrapper that holds the Python Global Interpreter Lock (GIL) for the
// lifetime of the object.
class PyGILRAII {
   PyGILState_STATE m_GILState;

public:
   PyGILRAII() : m_GILState(PyGILState_Ensure()) {}
   ~PyGILRAII() { PyGILState_Release(m_GILState); }
};
} // namespace Internal
} // namespace TMVA
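// Usage sketch (illustrative, not part of the class contract): any scope that
// calls into the Python C API can construct a PyGILRAII on the stack; the GIL
// is acquired in the constructor and released automatically on scope exit:
//
//    void SomeMethod() {
//       TMVA::Internal::PyGILRAII raii; // GIL held from here on
//       PyRun_SimpleString("print('hello')");
//    }                                  // GIL released here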
REGISTER_METHOD(PyAdaBoost)

ClassImp(MethodPyAdaBoost);
MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
                                   const TString &methodTitle,
                                   DataSetInfo &dsi,
                                   const TString &theOption) :
   PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}
MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData,
                                   const TString &theWeightFile) :
   PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}
MethodPyAdaBoost::~MethodPyAdaBoost(void)
{
}
Bool_t MethodPyAdaBoost::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
   return kFALSE;
}
void MethodPyAdaBoost::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(fBaseEstimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
      The base estimator from which the boosted ensemble is built.\
      Support for sample weighting is required, as well as proper `classes_`\
      and `n_classes_` attributes.");

   DeclareOptionRef(fNestimators, "NEstimators", "integer, optional (default=50)\
      The maximum number of estimators at which boosting is terminated.\
      In case of perfect fit, the learning procedure is stopped early.");

   DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=1.)\
      Learning rate shrinks the contribution of each classifier by\
      ``learning_rate``. There is a trade-off between ``learning_rate`` and\
      ``n_estimators``.");

   DeclareOptionRef(fAlgorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
      If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
      ``base_estimator`` must support calculation of class probabilities.\
      If 'SAMME' then use the SAMME discrete boosting algorithm.\
      The SAMME.R algorithm typically converges faster than SAMME,\
      achieving a lower test error with fewer boosting iterations.");

   DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
      "Store trained classifier in this file");
}
void MethodPyAdaBoost::ProcessOptions()
{
   pBaseEstimator = Eval(fBaseEstimator);
   if (!pBaseEstimator) {
      Log() << kFATAL << Form("BaseEstimator = %s ... that does not work!", fBaseEstimator.Data())
            << " The options are Object or None." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "baseEstimator", pBaseEstimator);

   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <= 0 ... that does not work!" << Endl;
   }
   pNestimators = Eval(Form("%i", fNestimators));
   PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);

   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <= 0 ... that does not work!" << Endl;
   }
   pLearningRate = Eval(Form("%f", fLearningRate));
   PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);

   if (fAlgorithm != "SAMME" && fAlgorithm != "SAMME.R") {
      Log() << kFATAL << Form("Algorithm = %s ... that does not work!", fAlgorithm.Data())
            << " The options are SAMME or SAMME.R." << Endl;
   }
   pAlgorithm = Eval(Form("'%s'", fAlgorithm.Data()));
   PyDict_SetItemString(fLocalNS, "algorithm", pAlgorithm);

   pRandomState = Eval(fRandomState);
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " if RandomState instance, random_state is the random number generator;"
            << " if None, the random number generator is the RandomState instance used by `np.random`." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);

   // If no filename is given, set default
   if (fFilenameClassifier.IsNull()) {
      fFilenameClassifier = GetWeightFileDir() + "/PyAdaBoostModel_" + GetName() + ".PyData";
   }
}
void MethodPyAdaBoost::Init()
{
   TMVA::Internal::PyGILRAII raii;
   _import_array(); // required to use numpy arrays

   // Check options and load them to the local python namespace
   ProcessOptions();

   // Import module for the AdaBoost classifier
   PyRunString("import sklearn.ensemble");

   // Get data properties
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}
void MethodPyAdaBoost::Train()
{
   // Load training data (data, classes, weights) into python arrays
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   PyArrayObject *fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject *)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   npy_intp dimsClasses = (npy_intp)fNrowsTraining;
   PyArrayObject *fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject *)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   PyArrayObject *fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject *)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      // Fill training data matrix in row-major order, one row per event
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }

      // Fill target classes
      TrainDataClasses[i] = e->GetClass();

      // Get event weight
      TrainDataWeights[i] = e->GetWeight();
   }

   // Create classifier object
   PyRunString("classifier = sklearn.ensemble.AdaBoostClassifier(base_estimator=baseEstimator, n_estimators=nEstimators, learning_rate=learningRate, algorithm=algorithm, random_state=randomState)",
               "Failed to setup classifier");
   // Train classifier
   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
               "Failed to train classifier");
   // Store classifier
   fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from AdaBoostClassifier" << Endl;
   }

   if (IsModelPersistence()) {
      Log() << gTools().Color("bold") << "Saving state file: "
            << gTools().Color("reset") << fFilenameClassifier << Endl;
      Serialize(fFilenameClassifier, fClassifier);
   }
}
void MethodPyAdaBoost::TestClassification()
{
   MethodBase::TestClassification();
}
std::vector<Double_t> MethodPyAdaBoost::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Determine the number of events to evaluate
   Long64_t nEvents = Data()->GetNEvents();
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;

   Timer timer(nEvents, GetName(), kTRUE);

   if (logProgress)
      Log() << kHEADER << Form("[%s] : ", DataInfo().GetName())
            << "Evaluation of " << GetMethodName() << " on "
            << (Data()->GetCurrentType() == Types::kTraining ? "training" : "testing")
            << " sample (" << nEvents << " events)" << Endl;

   // Load events into a python array
   npy_intp dims[2];
   dims[0] = nEvents;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      Data()->SetCurrentEvent(ievt);
      const TMVA::Event *e = Data()->GetEvent();
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }
   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
      fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
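   // predict_proba returns a row-major (nEvents x fNoutputs) array: the
   // probability that event i belongs to class k sits at proba[fNoutputs*i + k].
   // The loop below picks the signal column (TMVA::Types::kSignal).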
   // Return signal probabilities
   if (Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
   for (int i = 0; i < nEvents; ++i) {
      mvaValues[i] = proba[fNoutputs * i + TMVA::Types::kSignal];
   }
   // Release temporary python objects
   Py_DECREF(pEvent);
   Py_DECREF(result);

   if (logProgress) {
      Log() << kINFO
            << "Elapsed time for evaluation of " << nEvents << " events: "
            << timer.GetElapsedTime() << " " << Endl;
   }

   return mvaValues;
}
Double_t MethodPyAdaBoost::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // Cannot determine error
   NoErrorCalc(errLower, errUpper);

   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load it into a python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
      fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return the signal probability of the current event as MVA value
   Double_t mvaValue = proba[TMVA::Types::kSignal];

   Py_DECREF(result);
   Py_DECREF(pEvent);

   return mvaValue;
}
std::vector<Float_t> &MethodPyAdaBoost::GetMulticlassValues()
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load it into a python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
      fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   // Copy one probability per class
   if (UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
   for (UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];

   Py_DECREF(pEvent);
   Py_DECREF(result);

   return classValues;
}
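// Note on GetMulticlassValues() above (assumption based on scikit-learn's
// convention): predict_proba orders its columns by the sorted class labels in
// classifier.classes_. Since the training labels were the integer TMVA class
// IDs 0..fNoutputs-1, that order coincides with the TMVA class indices.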
void MethodPyAdaBoost::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   Log() << gTools().Color("bold") << "Loading state file: "
         << gTools().Color("reset") << fFilenameClassifier << Endl;

   // Load classifier from file
   Int_t err = UnSerialize(fFilenameClassifier, &fClassifier);
   if (err != 0) {
      Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s",
                              err, fFilenameClassifier.Data()) << Endl;
   }

   // Book classifier object in the python dict
   PyDict_SetItemString(fLocalNS, "classifier", fClassifier);

   // Load data properties
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}
const Ranking *MethodPyAdaBoost::CreateRanking()
{
   // Get feature importances from the classifier as an array with one entry
   // per variable; a higher value signals a higher importance
   PyArrayObject *pRanking = (PyArrayObject *)PyObject_GetAttrString(fClassifier, "feature_importances_");
   if (pRanking == 0) return NULL;

   // Fill ranking object and return it
   fRanking = new Ranking(GetName(), "Variable Importance");
   Double_t *rankingData = (Double_t *)PyArray_DATA(pRanking);
   for (UInt_t iVar = 0; iVar < fNvars; iVar++) {
      fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
   }

   Py_DECREF(pRanking);

   return fRanking;
}
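// Note on CreateRanking() above (per scikit-learn's documented behavior): for
// tree-based base estimators, feature_importances_ is the boosting-weighted
// average of each estimator's impurity-based importances, normalized so the
// values sum to 1.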
void MethodPyAdaBoost::GetHelpMessage() const
{
   Log() << "An AdaBoost classifier is a meta-estimator that begins by fitting" << Endl;
   Log() << "a classifier on the original dataset and then fits additional copies" << Endl;
   Log() << "of the classifier on the same dataset, but where the weights of incorrectly" << Endl;
   Log() << "classified instances are adjusted such that subsequent classifiers focus" << Endl;
   Log() << "more on difficult cases." << Endl;
   Log() << Endl;
   Log() << "Check out the scikit-learn documentation for more information." << Endl;
}