MethodPyAdaBoost.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyAdaBoost                                                      *
 * Web    : http://oproject.org                                                   *
 *                                                                                *
 * Description:                                                                   *
 *      AdaBoost classifier from scikit-learn                                     *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                         *
 *                                                                                *
 **********************************************************************************/

#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
#include "TMVA/MethodPyAdaBoost.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/DataSet.h"
#include "TMVA/Event.h"
#include "TMVA/IMethod.h"
#include "TMVA/MsgLogger.h"
#include "TMVA/PDF.h"
#include "TMVA/Ranking.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/Timer.h"
#include "TMVA/Results.h"

#include "TMath.h"
#include "Riostream.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include <iomanip>
#include <fstream>

using namespace TMVA;

namespace TMVA {
namespace Internal {
// RAII guard for the Python GIL: acquired on construction, released on
// destruction, so every exit path leaves the interpreter state consistent.
class PyGILRAII {
   PyGILState_STATE m_GILState;

public:
   PyGILRAII() : m_GILState(PyGILState_Ensure()) {}
   ~PyGILRAII() { PyGILState_Release(m_GILState); }
};
} // namespace Internal
} // namespace TMVA
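
// Usage sketch (illustrative, not part of the original source): any scope that
// calls into the Python C API can hold the GIL for its lifetime via the guard:
//
//    {
//       TMVA::Internal::PyGILRAII raii;              // acquires the GIL
//       PyRun_SimpleString("print('hello from python')");
//    }                                               // releases the GIL on scope exit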

REGISTER_METHOD(PyAdaBoost)

ClassImp(MethodPyAdaBoost);

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
                                   const TString &methodTitle,
                                   DataSetInfo &dsi,
                                   const TString &theOption) :
   PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData,
                                   const TString &theWeightFile) :
   PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::~MethodPyAdaBoost(void)
{
}

//_______________________________________________________________________
Bool_t MethodPyAdaBoost::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyAdaBoost::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(fBaseEstimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
      The base estimator from which the boosted ensemble is built.\
      Support for sample weighting is required, as well as proper `classes_`\
      and `n_classes_` attributes.");

   DeclareOptionRef(fNestimators, "NEstimators", "integer, optional (default=50)\
      The maximum number of estimators at which boosting is terminated.\
      In case of perfect fit, the learning procedure is stopped early.");

   DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=1.)\
      Learning rate shrinks the contribution of each classifier by\
      ``learning_rate``. There is a trade-off between ``learning_rate`` and\
      ``n_estimators``.");

   DeclareOptionRef(fAlgorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
      If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
      ``base_estimator`` must support calculation of class probabilities.\
      If 'SAMME' then use the SAMME discrete boosting algorithm.\
      The SAMME.R algorithm typically converges faster than SAMME,\
      achieving a lower test error with fewer boosting iterations.");

   DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
      "Store trained classifier in this file");
}
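
// Booking example (illustrative, hypothetical option values): with PyMVA these
// options are passed through the usual TMVA option string at booking time, e.g.
//
//    factory->BookMethod(dataloader, TMVA::Types::kPyAdaBoost, "PyAdaBoost",
//                        "NEstimators=100:LearningRate=0.5:Algorithm=SAMME.R");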

//_______________________________________________________________________
// Check options and load them to the local python namespace
void MethodPyAdaBoost::ProcessOptions()
{
   pBaseEstimator = Eval(fBaseEstimator);
   if (!pBaseEstimator) {
      Log() << kFATAL << Form("BaseEstimator = %s ... that does not work!", fBaseEstimator.Data())
            << " The options are a Python estimator object or None." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "baseEstimator", pBaseEstimator);

   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <= 0 ... that does not work!" << Endl;
   }
   pNestimators = Eval(Form("%i", fNestimators));
   PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);

   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <= 0 ... that does not work!" << Endl;
   }
   pLearningRate = Eval(Form("%f", fLearningRate));
   PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);

   if (fAlgorithm != "SAMME" && fAlgorithm != "SAMME.R") {
      Log() << kFATAL << Form("Algorithm = %s ... that does not work!", fAlgorithm.Data())
            << " The options are SAMME or SAMME.R." << Endl;
   }
   pAlgorithm = Eval(Form("'%s'", fAlgorithm.Data()));
   PyDict_SetItemString(fLocalNS, "algorithm", pAlgorithm);

   pRandomState = Eval(fRandomState);
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " if RandomState instance, random_state is the random number generator;"
            << " if None, the random number generator is the RandomState instance used by `np.random`." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);

   // If no filename is given, set default
   if (fFilenameClassifier.IsNull()) {
      fFilenameClassifier = GetWeightFileDir() + "/PyAdaBoostModel_" + GetName() + ".PyData";
   }
}
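
// Sketch (illustrative): Eval, inherited from PyMethodBase, interprets each
// option string as a Python expression in the method's local namespace, so the
// checks above end up holding real Python objects, e.g.
//
//    pNestimators = Eval("50");     // -> Python int 50
//    pRandomState = Eval("None");   // -> Python None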

//_______________________________________________________________________
void MethodPyAdaBoost::Init()
{
   TMVA::Internal::PyGILRAII raii;
   _import_array(); // required to use numpy arrays

   // Check options and load them to the local python namespace
   ProcessOptions();

   // Import module for the AdaBoost classifier
   PyRunString("import sklearn.ensemble");

   // Get data properties
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}

//_______________________________________________________________________
void MethodPyAdaBoost::Train()
{
   // Load training data (data, classes, weights) to python arrays
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   PyArrayObject *fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject *)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   npy_intp dimsClasses = (npy_intp)fNrowsTraining;
   PyArrayObject *fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject *)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   PyArrayObject *fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject *)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      // Fill training data matrix
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }

      // Fill target classes
      TrainDataClasses[i] = e->GetClass();

      // Get event weight
      TrainDataWeights[i] = e->GetWeight();
   }

   // Create classifier object
   PyRunString("classifier = sklearn.ensemble.AdaBoostClassifier(base_estimator=baseEstimator, n_estimators=nEstimators, learning_rate=learningRate, algorithm=algorithm, random_state=randomState)",
               "Failed to setup classifier");

   // Fit classifier
   // NOTE: We dump the output to a variable so that the call does not pollute stdout
   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)", "Failed to train classifier");

   // Store classifier
   fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from AdaBoostClassifier" << Endl;
      Log() << Endl;
   }

   if (IsModelPersistence()) {
      Log() << Endl;
      Log() << gTools().Color("bold") << "Saving state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
      Log() << Endl;
      Serialize(fFilenameClassifier, fClassifier);
   }
}
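
// Python-side equivalent of the call sequence above (sketch, assuming the
// objects bound into fLocalNS by ProcessOptions and Train):
//
//    import sklearn.ensemble
//    classifier = sklearn.ensemble.AdaBoostClassifier(
//        base_estimator=baseEstimator, n_estimators=nEstimators,
//        learning_rate=learningRate, algorithm=algorithm,
//        random_state=randomState)
//    classifier.fit(trainData, trainDataClasses, trainDataWeights)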

//_______________________________________________________________________
void MethodPyAdaBoost::TestClassification()
{
   MethodBase::TestClassification();
}

//_______________________________________________________________________
std::vector<Double_t> MethodPyAdaBoost::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Determine number of events
   Long64_t nEvents = Data()->GetNEvents();
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;

   // use timer
   Timer timer(nEvents, GetName(), kTRUE);

   if (logProgress)
      Log() << kHEADER << Form("[%s] : ", DataInfo().GetName())
            << "Evaluation of " << GetMethodName() << " on "
            << (Data()->GetCurrentType() == Types::kTraining ? "training" : "testing")
            << " sample (" << nEvents << " events)" << Endl;

   // Get data
   npy_intp dims[2];
   dims[0] = nEvents;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      Data()->SetCurrentEvent(ievt);
      const TMVA::Event *e = Data()->GetEvent();
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return signal probabilities
   if (Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
   for (int i = 0; i < nEvents; ++i) {
      mvaValues[i] = proba[fNoutputs * i + TMVA::Types::kSignal];
   }

   Py_DECREF(pEvent);
   Py_DECREF(result);

   if (logProgress) {
      Log() << kINFO
            << "Elapsed time for evaluation of " << nEvents << " events: "
            << timer.GetElapsedTime() << " " << Endl;
   }

   return mvaValues;
}
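
// Layout sketch (illustrative): predict_proba returns a row-major
// (nEvents x fNoutputs) array, so the probability of class c for event i is
//
//    proba[fNoutputs * i + c]
//
// and the signal column read above is c = TMVA::Types::kSignal.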

//_______________________________________________________________________
Double_t MethodPyAdaBoost::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA value
   Double_t mvaValue;
   mvaValue = proba[TMVA::Types::kSignal]; // getting signal probability

   Py_DECREF(result);
   Py_DECREF(pEvent);

   return mvaValue;
}

//_______________________________________________________________________
std::vector<Float_t> &MethodPyAdaBoost::GetMulticlassValues()
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA values
   if (UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
   for (UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];

   // Release the temporary python objects (mirrors GetMvaValue above and
   // avoids leaking a reference per call)
   Py_DECREF(result);
   Py_DECREF(pEvent);

   return classValues;
}

//_______________________________________________________________________
void MethodPyAdaBoost::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   Log() << Endl;
   Log() << gTools().Color("bold") << "Loading state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
   Log() << Endl;

   // Load classifier from file
   Int_t err = UnSerialize(fFilenameClassifier, &fClassifier);
   if (err != 0) {
      Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s", err, fFilenameClassifier.Data()) << Endl;
   }

   // Book classifier object in python dict
   PyDict_SetItemString(fLocalNS, "classifier", fClassifier);

   // Load data properties
   // NOTE: This has to be repeated here for the reader application
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}
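
// Persistence sketch: the model round-trip relies on the PyMethodBase helpers
// used in this file, which pickle the fitted sklearn object to
// fFilenameClassifier and restore it in the reader application:
//
//    Serialize(fFilenameClassifier, fClassifier);      // in Train()
//    UnSerialize(fFilenameClassifier, &fClassifier);   // here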

//_______________________________________________________________________
const Ranking *MethodPyAdaBoost::CreateRanking()
{
   // Get feature importance from the classifier as an array whose length equals
   // the number of variables; a higher value signals a higher importance.
   PyArrayObject *pRanking = (PyArrayObject *)PyObject_GetAttrString(fClassifier, "feature_importances_");
   // The python object is null if the base estimator does not support
   // variable ranking. Then, return NULL, which disables ranking.
   if (pRanking == 0) return NULL;

   // Fill ranking object and return it
   fRanking = new Ranking(GetName(), "Variable Importance");
   Double_t *rankingData = (Double_t *)PyArray_DATA(pRanking);
   for (UInt_t iVar = 0; iVar < fNvars; iVar++) {
      fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
   }

   Py_DECREF(pRanking);

   return fRanking;
}

//_______________________________________________________________________
void MethodPyAdaBoost::GetHelpMessage() const
{
   // typical length of text line:
   //       "|--------------------------------------------------------------|"
   Log() << "An AdaBoost classifier is a meta-estimator that begins by fitting" << Endl;
   Log() << "a classifier on the original dataset and then fits additional copies" << Endl;
   Log() << "of the classifier on the same dataset but where the weights of incorrectly" << Endl;
   Log() << "classified instances are adjusted such that subsequent classifiers focus" << Endl;
   Log() << "more on difficult cases." << Endl;
   Log() << Endl;
   Log() << "Check out the scikit-learn documentation for more information." << Endl;
}