Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RuleFitParams.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : RuleFitParams *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * A class doing the actual fitting of a linear model using rules as *
12  * base functions. *
13  * Reference paper: 1.Gradient Directed Regularization *
14  * Friedman, Popescu, 2004 *
15  * 2.Predictive Learning with Rule Ensembles *
16  * Friedman, Popescu, 2005 *
17  * *
18  * *
19  * Authors (alphabetical): *
20  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
21  * Helge Voss <Helge.Voss@cern.ch> - MPI-KP Heidelberg, Ger. *
22  * *
23  * Copyright (c) 2005: *
24  * CERN, Switzerland *
25  * Iowa State U. *
26  * MPI-K Heidelberg, Germany *
27  * *
28  * Redistribution and use in source and binary forms, with or without *
29  * modification, are permitted according to the terms listed in LICENSE *
30  * (http://tmva.sourceforge.net/LICENSE) *
31  **********************************************************************************/
32 
33 #ifndef ROOT_TMVA_RuleFitParams
34 #define ROOT_TMVA_RuleFitParams
35 
36 #include "TMathBase.h"
37 
38 #include "TMVA/Event.h"
39 
40 class TTree;
41 
42 namespace TMVA {
43 
44  class RuleEnsemble;
45  class MsgLogger;
46  class RuleFit;
47  class RuleFitParams {
48 
49  public:
50 
51  RuleFitParams();
52  virtual ~RuleFitParams();
53 
54  void Init();
55 
56  // set message type
57  void SetMsgType( EMsgType t );
58 
59  // set RuleFit ptr
60  void SetRuleFit( RuleFit *rf ) { fRuleFit = rf; }
61  //
62  // GD path: set N(path steps)
63  void SetGDNPathSteps( Int_t np ) { fGDNPathSteps = np; }
64 
65  // GD path: set path step size
66  void SetGDPathStep( Double_t s ) { fGDPathStep = s; }
67 
68  // GD path: set tau search range
69  void SetGDTauRange( Double_t t0, Double_t t1 )
70  {
71  fGDTauMin = (t0>1.0 ? 1.0:(t0<0.0 ? 0.0:t0));
72  fGDTauMax = (t1>1.0 ? 1.0:(t1<0.0 ? 0.0:t1));
73  if (fGDTauMax<fGDTauMin) fGDTauMax = fGDTauMin;
74  }
75 
76  // GD path: set number of steps in tau search range
77  void SetGDTauScan( UInt_t n ) { fGDTauScan = n; }
78 
79  // GD path: set tau
80  void SetGDTau( Double_t t ) { fGDTau = t; }
81 
82 
83  void SetGDErrScale( Double_t s ) { fGDErrScale = s; }
84  void SetGDTauPrec( Double_t p ) { fGDTauPrec=p; CalcGDNTau(); fGDTauVec.resize(fGDNTau); }
85 
86  // return type such that +1 = signal and -1 = background
87  Int_t Type( const Event * e ) const; // return (fRuleFit->GetMethodRuleFit()->DataInfo().IsSignal(e) ? 1:-1); }
88  //
89  UInt_t GetPathIdx1() const { return fPathIdx1; }
90  UInt_t GetPathIdx2() const { return fPathIdx2; }
91  UInt_t GetPerfIdx1() const { return fPerfIdx1; }
92  UInt_t GetPerfIdx2() const { return fPerfIdx2; }
93 
94  // Loss function; Huber loss eq 33
95  Double_t LossFunction( const Event& e ) const;
96 
97  // same but using evt idx (faster)
98  Double_t LossFunction( UInt_t evtidx ) const;
99  Double_t LossFunction( UInt_t evtidx, UInt_t itau ) const;
100 
101  // Empirical risk
102  Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const;
103  Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff, UInt_t itau) const;
104 
105  // Risk evaluation for fPathIdx and fPerfInd
106  Double_t RiskPath() const { return Risk(fPathIdx1,fPathIdx2,fNEveEffPath); }
107  Double_t RiskPerf() const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf); }
108  Double_t RiskPerf( UInt_t itau ) const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf,itau); }
109 
110  // Risk evaluation for all tau
111  UInt_t RiskPerfTst();
112 
113  // Penalty function; Lasso function (eq 8)
114  Double_t Penalty() const;
115 
116  // initialize GD path
117  void InitGD();
118 
119  // find best tau and return the number of scan steps used
120  Int_t FindGDTau();
121 
122  // make path for binary classification (squared-error ramp, sect 6 in ref 1)
123  void MakeGDPath();
124 
125  protected:
126 
127  // typedef of an Event const iterator
128  typedef std::vector<const TMVA::Event *>::const_iterator EventItr;
129 
130  // init ntuple
131  void InitNtuple();
132 
133  // calculate N(tau) in scan - limit to 100000.
134  void CalcGDNTau() { fGDNTau = static_cast<UInt_t>(1.0/fGDTauPrec)+1; if (fGDNTau>100000) fGDNTau=100000; }
135 
136  // fill ntuple with coefficient info
137  void FillCoefficients();
138 
139  // estimate the optimum scoring function
140  void CalcFStar();
141 
142  // estimate of binary error rate
143  Double_t ErrorRateBin();
144 
145  // estimate of scale average error rate
146  Double_t ErrorRateReg();
147 
148  // estimate 1-area under ROC
149  Double_t ErrorRateRocRaw( std::vector<Double_t> & sFsig, std::vector<Double_t> & sFbkg );
150  Double_t ErrorRateRoc();
151  void ErrorRateRocTst();
152 
153  // estimate optimism
154  Double_t Optimism();
155 
156  // make gradient vector (eq 44 in ref 1)
157  void MakeGradientVector();
158 
159  // Calculate the direction in parameter space (eq 25, ref 1) and update coeffs (eq 22, ref 1)
160  void UpdateCoefficients();
161 
162  // calculate average of responses of F
163  Double_t CalcAverageResponse();
164  Double_t CalcAverageResponseOLD();
165 
166  // calculate average of true response (initial estimate of a0)
167  Double_t CalcAverageTruth();
168 
169  // calculate the average of each variable over the range
170  void EvaluateAverage(UInt_t ind1, UInt_t ind2,
171  std::vector<Double_t> &avsel,
172  std::vector<Double_t> &avrul);
173 
174  // evaluate using fPathIdx1,2
175  void EvaluateAveragePath() { EvaluateAverage( fPathIdx1, fPathIdx2, fAverageSelectorPath, fAverageRulePath ); }
176 
177  // evaluate using fPerfIdx1,2
178  void EvaluateAveragePerf() { EvaluateAverage( fPerfIdx1, fPerfIdx2, fAverageSelectorPerf, fAverageRulePerf ); }
179 
180  // the same as above but for the various tau
181  void MakeTstGradientVector();
182  void UpdateTstCoefficients();
183  void CalcTstAverageResponse();
184 
185 
186  RuleFit * fRuleFit; // rule fit
187  RuleEnsemble * fRuleEnsemble; // rule ensemble
188  //
189  UInt_t fNRules; // number of rules
190  UInt_t fNLinear; // number of linear terms
191  //
192  // Event indices for path/validation - TODO: should let the user decide
193  // Now it is just a simple one-fold cross validation.
194  //
195  UInt_t fPathIdx1; // first event index for path search
196  UInt_t fPathIdx2; // last event index for path search
197  UInt_t fPerfIdx1; // first event index for performance evaluation
198  UInt_t fPerfIdx2; // last event index for performance evaluation
199  Double_t fNEveEffPath; // sum of weights for Path events
200  Double_t fNEveEffPerf; // idem for Perf events
201 
202  std::vector<Double_t> fAverageSelectorPath; // average of each variable over the range fPathIdx1,2
203  std::vector<Double_t> fAverageRulePath; // average of each rule, same range
204  std::vector<Double_t> fAverageSelectorPerf; // average of each variable over the range fPerfIdx1,2
205  std::vector<Double_t> fAverageRulePerf; // average of each rule, same range
206 
207  std::vector<Double_t> fGradVec; // gradient vector - dimension = number of rules in ensemble
208  std::vector<Double_t> fGradVecLin; // gradient vector - dimension = number of variables
209 
210  std::vector< std::vector<Double_t> > fGradVecTst; // gradient vector - one per tau
211  std::vector< std::vector<Double_t> > fGradVecLinTst; // gradient vector, linear terms - one per tau
212  //
213  std::vector<Double_t> fGDErrTst; // error rates per tau
214  std::vector<Char_t> fGDErrTstOK; // error rate is sufficiently low <--- stores boolean
215  std::vector< std::vector<Double_t> > fGDCoefTst; // rule coeffs - one per tau
216  std::vector< std::vector<Double_t> > fGDCoefLinTst; // linear coeffs - one per tau
217  std::vector<Double_t> fGDOfsTst; // offset per tau
218  std::vector< Double_t > fGDTauVec; // the tau's
219  UInt_t fGDNTauTstOK; // number of tau in the test-phase that are ok
220  UInt_t fGDNTau; // number of tau-paths - calculated in SetGDTauPrec
221  Double_t fGDTauPrec; // precision in tau
222  UInt_t fGDTauScan; // number scan for tau-paths
223  Double_t fGDTauMin; // min threshold parameter (tau in eq 26, ref 1)
224  Double_t fGDTauMax; // max threshold parameter (tau in eq 26, ref 1)
225  Double_t fGDTau; // selected threshold parameter (tau in eq 26, ref 1)
226  Double_t fGDPathStep; // step size along path (delta nu in eq 22, ref 1)
227  Int_t fGDNPathSteps; // number of path steps
228  Double_t fGDErrScale; // stop scan at error = scale*errmin
229  //
230  Double_t fAverageTruth; // average truth, ie sum(y)/N, y=+-1
231  //
232  std::vector<Double_t> fFstar; // vector of F*() - filled in CalcFStar()
233  Double_t fFstarMedian; // median value of F*() using
234  //
235  TTree *fGDNtuple; // Gradient path ntuple, contains params for each step along the path
236  Double_t fNTRisk; // GD path: risk
237  Double_t fNTErrorRate; // GD path: error rate (or performance)
238  Double_t fNTNuval; // GD path: value of nu
239  Double_t fNTCoefRad; // GD path: 'radius' of all rulecoeffs
240  Double_t fNTOffset; // GD path: model offset
241  Double_t *fNTCoeff; // GD path: rule coefficients
242  Double_t *fNTLinCoeff; // GD path: linear coefficients
243 
244  Double_t fsigave; // Sigma of current signal score function F(sig)
245  Double_t fsigrms; // Rms of F(sig)
246  Double_t fbkgave; // Average of F(bkg)
247  Double_t fbkgrms; // Rms of F(bkg)
248 
249  private:
250 
251  mutable MsgLogger* fLogger; //! message logger
252  MsgLogger& Log() const { return *fLogger; }
253 
254  };
255 
256  // --------------------------------------------------------
257 
258  class AbsValue {
259 
260  public:
261 
262  Bool_t operator()( Double_t first, Double_t second ) const { return TMath::Abs(first) < TMath::Abs(second); }
263  };
264 }
265 
266 
267 #endif