#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>
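// RAII guard for the Python Global Interpreter Lock: the constructor acquires the GIL
// via PyGILState_Ensure() and the destructor releases it, so any scope that touches the
// Python C API stays safe against exceptions and early returns.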
   PyGILState_STATE m_GILState;

public:
   PyGILRAII() : m_GILState(PyGILState_Ensure()) {}
   ~PyGILRAII() { PyGILState_Release(m_GILState); }
REGISTER_METHOD(PyGTB)

ClassImp(MethodPyGTB);
MethodPyGTB::MethodPyGTB(const TString &jobName,
                         const TString &methodTitle,
                         DataSetInfo &dsi,
                         const TString &theOption) :
   PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption),
   fMinWeightFractionLeaf(0.0),
   fMaxLeafNodes("None"),
MethodPyGTB::MethodPyGTB(DataSetInfo &theData, const TString &theWeightFile)
   : PyMethodBase(Types::kPyGTB, theData, theWeightFile),
   fMinWeightFractionLeaf(0.0),
   fRandomState("None"),
   fMaxFeatures("None"),
   fMaxLeafNodes("None"),
MethodPyGTB::~MethodPyGTB(void)
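// PyGTB handles binary classification and multiclass classification with two or more classes.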
Bool_t MethodPyGTB::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
   return kFALSE;
}
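// Declare the configurable options. Each option mirrors a constructor argument of
// sklearn.ensemble.GradientBoostingClassifier, and the help strings follow the
// scikit-learn documentation.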
void MethodPyGTB::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();
   DeclareOptionRef(fLoss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
      loss function to be optimized. 'deviance' refers to\
      deviance (= logistic regression) for classification\
      with probabilistic outputs. For loss 'exponential' gradient\
      boosting recovers the AdaBoost algorithm.");
   DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=0.1)\
      learning rate shrinks the contribution of each tree by `learning_rate`.\
      There is a trade-off between learning_rate and n_estimators.");
   DeclareOptionRef(fNestimators, "NEstimators", "int (default=100)\
      The number of boosting stages to perform. Gradient boosting\
      is fairly robust to over-fitting so a large number usually\
      results in better performance.");
   DeclareOptionRef(fSubsample, "Subsample", "float, optional (default=1.0)\
      The fraction of samples to be used for fitting the individual base\
      learners. If smaller than 1.0 this results in Stochastic Gradient\
      Boosting. `subsample` interacts with the parameter `n_estimators`.\
      Choosing `subsample < 1.0` leads to a reduction of variance\
      and an increase in bias.");
   DeclareOptionRef(fMinSamplesSplit, "MinSamplesSplit", "integer, optional (default=2)\
      The minimum number of samples required to split an internal node.");
   DeclareOptionRef(fMinSamplesLeaf, "MinSamplesLeaf", "integer, optional (default=1)\
      The minimum number of samples in newly created leaves. A split is\
      discarded if after the split, one of the leaves would contain less than\
      ``min_samples_leaf`` samples.");
   DeclareOptionRef(fMinWeightFractionLeaf, "MinWeightFractionLeaf", "float, optional (default=0.)\
      The minimum weighted fraction of the input samples required to be at a\
      leaf node.");
   DeclareOptionRef(fMaxDepth, "MaxDepth", "integer or None, optional (default=None)\
      The maximum depth of the tree. If None, then nodes are expanded until\
      all leaves are pure or until all leaves contain less than\
      min_samples_split samples.\
      Ignored if ``max_leaf_nodes`` is not None.");
   DeclareOptionRef(fInit, "Init", "BaseEstimator, None, optional (default=None)\
      An estimator object that is used to compute the initial\
      predictions. ``init`` has to provide ``fit`` and ``predict``.\
      If None it uses ``loss.init_estimator``.");
   DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");
   DeclareOptionRef(fMaxFeatures, "MaxFeatures",
                    "The number of features to consider when looking for the best split");
   DeclareOptionRef(fVerbose, "Verbose", "int, optional (default=0)\
      Controls the verbosity of the tree building process.");
   DeclareOptionRef(fMaxLeafNodes, "MaxLeafNodes", "int or None, optional (default=None)\
      Grow trees with ``max_leaf_nodes`` in best-first fashion.\
      Best nodes are defined as relative reduction in impurity.\
      If None then unlimited number of leaf nodes.\
      If not None then ``max_depth`` will be ignored.");
   DeclareOptionRef(fWarmStart, "WarmStart", "bool, optional (default=False)\
      When set to ``True``, reuse the solution of the previous call to fit\
      and add more estimators to the ensemble, otherwise, just fit a whole\
      new ensemble.");
   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
                    "Store trained classifier in this file");
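// Check the option values and convert them into Python objects, which are stored in the
// local Python namespace (fLocalNS) so that the classifier constructor can pick them up.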
void MethodPyGTB::ProcessOptions()
{
   if (fLoss != "deviance" && fLoss != "exponential") {
      Log() << kFATAL << Form("Loss = %s ... that does not work!", fLoss.Data())
            << " The options are 'deviance' or 'exponential'." << Endl;
   }
   pLoss = Eval(Form("'%s'", fLoss.Data()));
   PyDict_SetItemString(fLocalNS, "loss", pLoss);
   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <= 0 ... that does not work!" << Endl;
   }
   pLearningRate = Eval(Form("%f", fLearningRate));
   PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);
   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <= 0 ... that does not work!" << Endl;
   }
   pNestimators = Eval(Form("%i", fNestimators));
   PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);
   if (fMinSamplesSplit < 0) {
      Log() << kFATAL << "MinSamplesSplit < 0 ... that does not work!" << Endl;
   }
   pMinSamplesSplit = Eval(Form("%i", fMinSamplesSplit));
   PyDict_SetItemString(fLocalNS, "minSamplesSplit", pMinSamplesSplit);
   if (fSubsample < 0) {
      Log() << kFATAL << "Subsample < 0 ... that does not work!" << Endl;
   }
   pSubsample = Eval(Form("%f", fSubsample));
   PyDict_SetItemString(fLocalNS, "subsample", pSubsample);
   if (fMinSamplesLeaf < 0) {
      Log() << kFATAL << "MinSamplesLeaf < 0 ... that does not work!" << Endl;
   }
   pMinSamplesLeaf = Eval(Form("%i", fMinSamplesLeaf));
   PyDict_SetItemString(fLocalNS, "minSamplesLeaf", pMinSamplesLeaf);
   if (fMinWeightFractionLeaf < 0) {
      Log() << kFATAL << "MinWeightFractionLeaf < 0 ... that does not work!" << Endl;
   }
   pMinWeightFractionLeaf = Eval(Form("%f", fMinWeightFractionLeaf));
   PyDict_SetItemString(fLocalNS, "minWeightFractionLeaf", pMinWeightFractionLeaf);
   if (fMaxDepth <= 0) {
      Log() << kFATAL << "MaxDepth <= 0 ... that does not work!" << Endl;
   }
   pMaxDepth = Eval(Form("%i", fMaxDepth));
   PyDict_SetItemString(fLocalNS, "maxDepth", pMaxDepth);
   pInit = Eval(fInit);
   if (!pInit) {
      Log() << kFATAL << Form("Init = %s ... that does not work!", fInit.Data())
            << " The options are None or BaseEstimator, which is an estimator object that "
            << "is used to compute the initial predictions. "
            << "'init' has to provide 'fit' and 'predict' methods. "
            << "If None it uses 'loss.init_estimator'." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "init", pInit);
   pRandomState = Eval(fRandomState);
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " If RandomState instance, random_state is the random number generator;"
            << " If None, the random number generator is the RandomState instance used by 'np.random'."
            << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);
   if (fMaxFeatures == "auto" || fMaxFeatures == "sqrt" || fMaxFeatures == "log2") {
      fMaxFeatures = Form("'%s'", fMaxFeatures.Data());
   }
   pMaxFeatures = Eval(fMaxFeatures);
   PyDict_SetItemString(fLocalNS, "maxFeatures", pMaxFeatures);
   if (!pMaxFeatures) {
      Log() << kFATAL << Form("MaxFeatures = %s ... that does not work!", fMaxFeatures.Data())
            << " int, float, string or None, optional (default='auto')."
            << " The number of features to consider when looking for the best split:"
            << " If int, then consider `max_features` features at each split."
            << " If float, then `max_features` is a percentage and"
            << " `int(max_features * n_features)` features are considered at each split."
            << " If 'auto', then `max_features=sqrt(n_features)`."
            << " If 'sqrt', then `max_features=sqrt(n_features)`."
            << " If 'log2', then `max_features=log2(n_features)`."
            << " If None, then `max_features=n_features`." << Endl;
   }
   pMaxLeafNodes = Eval(fMaxLeafNodes);
   if (!pMaxLeafNodes) {
      Log() << kFATAL << Form("MaxLeafNodes = %s ... that does not work!", fMaxLeafNodes.Data())
            << " The options are None or integer." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "maxLeafNodes", pMaxLeafNodes);
   pVerbose = Eval(Form("%i", fVerbose));
   PyDict_SetItemString(fLocalNS, "verbose", pVerbose);
   pWarmStart = Eval(Form("%i", UInt_t(fWarmStart)));
   PyDict_SetItemString(fLocalNS, "warmStart", pWarmStart);
   if (fFilenameClassifier.IsNull()) {
      fFilenameClassifier = GetWeightFileDir() + "/PyGTBModel_" + GetName() + ".PyData";
   }
// Initialize the Python environment for this method: import the required scikit-learn
// module and cache the number of input variables and output classes.
void MethodPyGTB::Init()
{
   TMVA::Internal::PyGILRAII raii;

   // Import the scikit-learn ensemble module into the local Python namespace
   PyRunString("import sklearn.ensemble");
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
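// Train the GradientBoostingClassifier: copy the TMVA training events into numpy arrays,
// build the classifier from the options prepared in ProcessOptions(), fit it, and
// optionally serialize the trained model to the weight file.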
void MethodPyGTB::Train()
{
   // Load training events (one row per event, one column per variable) into a numpy array
   int fNrowsTraining = Data()->GetNTrainingEvents();
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   PyArrayObject *fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject *)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));
   npy_intp dimsClasses = (npy_intp)fNrowsTraining;
   PyArrayObject *fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject *)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
   PyArrayObject *fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject *)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
   for (int i = 0; i < fNrowsTraining; i++) {
      // Fill the feature matrix, class labels and event weights
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      TrainDataClasses[i] = e->GetClass();
      TrainDataWeights[i] = e->GetWeight();
   }
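   // Build the classifier inside the embedded interpreter. The keyword arguments (loss,
   // learningRate, nEstimators, ...) refer to the Python objects that ProcessOptions()
   // placed in fLocalNS.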
   PyRunString("classifier = sklearn.ensemble.GradientBoostingClassifier(loss=loss, learning_rate=learningRate, n_estimators=nEstimators, max_depth=maxDepth, min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, subsample=subsample, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, init=init, verbose=verbose, warm_start=warmStart, random_state=randomState)",
               "Failed to setup classifier");
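   // Fit on the prepared arrays; the event weights are passed as scikit-learn's sample_weight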
   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
               "Failed to train classifier");
   fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from GradientBoostingClassifier" << Endl;
   }
   if (IsModelPersistence()) {
      Log() << gTools().Color("bold") << "Saving state file: "
            << gTools().Color("reset") << fFilenameClassifier << Endl;
      Serialize(fFilenameClassifier, fClassifier);
   }
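// Standard classification testing, delegated to MethodBase.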
void MethodPyGTB::TestClassification()
{
   MethodBase::TestClassification();
}
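// Evaluate the classifier on a range of events in one go and return the per-event MVA
// values, i.e. the signal-class probability reported by predict_proba.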
std::vector<Double_t> MethodPyGTB::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
{
   // Load the trained model if it is not available yet
   if (fClassifier == 0) ReadModelFromFile();

   // Determine the range of events to evaluate
   Long64_t nEvents = Data()->GetNEvents();
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;
   Timer timer(nEvents, GetName(), kTRUE);
   Log() << kHEADER << Form("[%s] : ", DataInfo().GetName())
         << "Evaluation of " << GetMethodName() << " on "
         << (Data()->GetCurrentType() == Types::kTraining ? "training" : "testing")
         << " sample (" << nEvents << " events)" << Endl;
   // Allocate an (nEvents x fNvars) numpy array and fill it with the event features
   npy_intp dims[2];
   dims[0] = (npy_intp)nEvents;
   dims[1] = (npy_intp)fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      Data()->SetCurrentEvent(ievt);
      const TMVA::Event *e = Data()->GetEvent();
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }
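   // Query per-class probabilities for all events in a single predict_proba call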
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
      const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   // Store the signal-class probability as the MVA value of each event
   if (Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
   for (int i = 0; i < nEvents; ++i) {
      mvaValues[i] = proba[fNoutputs * i + TMVA::Types::kSignal];
   }

   Log() << "Elapsed time for evaluation of " << nEvents << " events: "
         << timer.GetElapsedTime() << " " << Endl;
// Evaluate the classifier for the current event and return the signal probability as the MVA value.
Double_t MethodPyGTB::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // Cannot determine error
   NoErrorCalc(errLower, errUpper);

   // Load the trained model if it is not available yet
   if (fClassifier == 0) ReadModelFromFile();

   // Get the current event and its features
   const TMVA::Event *e = Data()->GetEvent();
   // Copy the event into a (1 x fNvars) numpy array
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = (npy_intp)fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
      const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   mvaValue = proba[TMVA::Types::kSignal];
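// Return the per-class probabilities of the current event for multiclass classification.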
std::vector<Float_t> &MethodPyGTB::GetMulticlassValues()
{
   // Load the trained model if it is not available yet
   if (fClassifier == 0) ReadModelFromFile();

   // Get the current event and its features
   const TMVA::Event *e = Data()->GetEvent();
   // Copy the event into a (1 x fNvars) numpy array
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = (npy_intp)fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
      const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   if (UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
   for (UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];
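// Restore a previously trained classifier from the serialized state file and register it
// in the local Python namespace.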
void MethodPyGTB::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   Log() << gTools().Color("bold") << "Loading state file: "
         << gTools().Color("reset") << fFilenameClassifier << Endl;
   Int_t err = UnSerialize(fFilenameClassifier, &fClassifier);
   if (err != 0) {
      Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s",
                              err, fFilenameClassifier.Data()) << Endl;
   }
   PyDict_SetItemString(fLocalNS, "classifier", fClassifier);
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
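// Rank the input variables by the feature_importances_ attribute of the trained classifier.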
const Ranking *MethodPyGTB::CreateRanking()
{
   // Get feature importances from the scikit-learn classifier
   PyArrayObject *pRanking = (PyArrayObject *)PyObject_GetAttrString(fClassifier, "feature_importances_");
   if (pRanking == 0) Log() << kFATAL << "Failed to get ranking from classifier" << Endl;
   fRanking = new Ranking(GetName(), "Variable Importance");
   Double_t *rankingData = (Double_t *)PyArray_DATA(pRanking);
   for (UInt_t iVar = 0; iVar < fNvars; iVar++) {
      fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
   }
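// Short help message printed by the TMVA factory.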
void MethodPyGTB::GetHelpMessage() const
{
   Log() << "A gradient tree boosting classifier builds a model from an ensemble" << Endl;
   Log() << "of decision trees, which are adapted at each boosting step to fit better" << Endl;
   Log() << "to previously misclassified events." << Endl;
   Log() << "Check out the scikit-learn documentation for more information." << Endl;