Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RooKeysPdf.cxx
Go to the documentation of this file.
1 /*****************************************************************************
2  * Project: RooFit *
3  * Package: RooFitModels *
4  * @(#)root/roofit:$Id$
5  * Authors: *
6  * GR, Gerhard Raven, UC San Diego, raven@slac.stanford.edu *
7  * DK, David Kirkby, UC Irvine, dkirkby@uci.edu *
8  * WV, Wouter Verkerke, UC Santa Barbara, verkerke@slac.stanford.edu *
9  * *
10  * Copyright (c) 2000-2005, Regents of the University of California *
11  * and Stanford University. All rights reserved. *
12  * *
13  * Redistribution and use in source and binary forms, *
14  * with or without modification, are permitted according to the terms *
15  * listed in LICENSE (http://roofit.sourceforge.net/license.txt) *
16  *****************************************************************************/
17 
18 /** \class RooKeysPdf
19  \ingroup Roofit
20 
21 Class RooKeysPdf implements a one-dimensional kernel estimation p.d.f which model the distribution
22 of an arbitrary input dataset as a superposition of Gaussian kernels, one for each data point,
23 each contributing 1/N to the total integral of the p.d.f..
24 If the 'adaptive mode' is enabled, the width of the Gaussian is adaptively calculated from the
25 local density of events, i.e. narrow for regions with high event density to preserve details and
26 wide for regions with low event density to promote smoothness. The details of the general algorithm
27 are described in the following paper:
28 
29 Cranmer KS, Kernel Estimation in High-Energy Physics.
30  Computer Physics Communications 136:198-207,2001 - e-Print Archive: hep ex/0011057
31 **/
32 
33 #include "RooFit.h"
34 
35 #include <limits>
36 #include <algorithm>
37 #include <cmath>
38 #include "Riostream.h"
39 #include "TMath.h"
40 
41 #include "RooKeysPdf.h"
42 #include "RooAbsReal.h"
43 #include "RooRealVar.h"
44 #include "RooRandom.h"
45 #include "RooDataSet.h"
46 #include "RooTrace.h"
47 
48 #include "TError.h"
49 
50 using namespace std;
51 
52 ClassImp(RooKeysPdf);
53 
54 const Double_t RooKeysPdf::_nSigma = std::sqrt(-2. *
55  std::log(std::numeric_limits<Double_t>::epsilon()));
56 
57 ////////////////////////////////////////////////////////////////////////////////
58 /// coverity[UNINIT_CTOR]
59 
60  RooKeysPdf::RooKeysPdf() : _nEvents(0), _dataPts(0), _dataWgts(0), _weights(0), _sumWgt(0),
61  _mirrorLeft(kFALSE), _mirrorRight(kFALSE),
62  _asymLeft(kFALSE), _asymRight(kFALSE)
63 {
64  TRACE_CREATE
65 }
66 
67 ////////////////////////////////////////////////////////////////////////////////
68 /// cache stuff about x
69 
70 RooKeysPdf::RooKeysPdf(const char *name, const char *title,
71  RooAbsReal& x, RooDataSet& data,
72  Mirror mirror, Double_t rho) :
73  RooAbsPdf(name,title),
74  _x("x","observable",this,x),
75  _nEvents(0),
76  _dataPts(0),
77  _dataWgts(0),
78  _weights(0),
79  _mirrorLeft(mirror==MirrorLeft || mirror==MirrorBoth || mirror==MirrorLeftAsymRight),
80  _mirrorRight(mirror==MirrorRight || mirror==MirrorBoth || mirror==MirrorAsymLeftRight),
81  _asymLeft(mirror==MirrorAsymLeft || mirror==MirrorAsymLeftRight || mirror==MirrorAsymBoth),
82  _asymRight(mirror==MirrorAsymRight || mirror==MirrorLeftAsymRight || mirror==MirrorAsymBoth),
83  _rho(rho)
84 {
85  snprintf(_varName, 128,"%s", x.GetName());
86  RooAbsRealLValue& real= (RooRealVar&)(_x.arg());
87  _lo = real.getMin();
88  _hi = real.getMax();
89  _binWidth = (_hi-_lo)/(_nPoints-1);
90 
91  // form the lookup table
92  LoadDataSet(data);
93  TRACE_CREATE
94 }
95 
96 ////////////////////////////////////////////////////////////////////////////////
97 /// cache stuff about x
98 
99 RooKeysPdf::RooKeysPdf(const char *name, const char *title,
100  RooAbsReal& xpdf, RooRealVar& xdata, RooDataSet& data,
101  Mirror mirror, Double_t rho) :
102  RooAbsPdf(name,title),
103  _x("x","Observable",this,xpdf),
104  _nEvents(0),
105  _dataPts(0),
106  _dataWgts(0),
107  _weights(0),
108  _mirrorLeft(mirror==MirrorLeft || mirror==MirrorBoth || mirror==MirrorLeftAsymRight),
109  _mirrorRight(mirror==MirrorRight || mirror==MirrorBoth || mirror==MirrorAsymLeftRight),
110  _asymLeft(mirror==MirrorAsymLeft || mirror==MirrorAsymLeftRight || mirror==MirrorAsymBoth),
111  _asymRight(mirror==MirrorAsymRight || mirror==MirrorLeftAsymRight || mirror==MirrorAsymBoth),
112  _rho(rho)
113 {
114  snprintf(_varName, 128,"%s", xdata.GetName());
115  RooAbsRealLValue& real= (RooRealVar&)(xdata);
116  _lo = real.getMin();
117  _hi = real.getMax();
118  _binWidth = (_hi-_lo)/(_nPoints-1);
119 
120  // form the lookup table
121  LoadDataSet(data);
122  TRACE_CREATE
123 }
124 
125 ////////////////////////////////////////////////////////////////////////////////
126 
127 RooKeysPdf::RooKeysPdf(const RooKeysPdf& other, const char* name):
128  RooAbsPdf(other,name), _x("x",this,other._x), _nEvents(other._nEvents),
129  _dataPts(0), _dataWgts(0), _weights(0), _sumWgt(0),
130  _mirrorLeft( other._mirrorLeft ), _mirrorRight( other._mirrorRight ),
131  _asymLeft(other._asymLeft), _asymRight(other._asymRight),
132  _rho( other._rho ) {
133  // cache stuff about x
134  snprintf(_varName, 128, "%s", other._varName );
135  _lo = other._lo;
136  _hi = other._hi;
137  _binWidth = other._binWidth;
138 
139  // copy over data and weights... not necessary, commented out for speed
140 // _dataPts = new Double_t[_nEvents];
141 // _weights = new Double_t[_nEvents];
142 // for (Int_t i= 0; i<_nEvents; i++) {
143 // _dataPts[i]= other._dataPts[i];
144 // _weights[i]= other._weights[i];
145 // }
146 
147  // copy over the lookup table
148  for (Int_t i= 0; i<_nPoints+1; i++)
149  _lookupTable[i]= other._lookupTable[i];
150 
151  TRACE_CREATE
152 }
153 
154 ////////////////////////////////////////////////////////////////////////////////
155 
156 RooKeysPdf::~RooKeysPdf() {
157  delete[] _dataPts;
158  delete[] _dataWgts;
159  delete[] _weights;
160 
161  TRACE_DESTROY
162 }
163 
164 ////////////////////////////////////////////////////////////////////////////////
165 /// small helper structure
166 
167 namespace {
168  struct Data {
169  Double_t x;
170  Double_t w;
171  };
172  // helper to order two Data structures
173  struct cmp {
174  inline bool operator()(const struct Data& a, const struct Data& b) const
175  { return a.x < b.x; }
176  };
177 }
178 void RooKeysPdf::LoadDataSet( RooDataSet& data) {
179  delete[] _dataPts;
180  delete[] _dataWgts;
181  delete[] _weights;
182 
183  std::vector<Data> tmp;
184  tmp.reserve((1 + _mirrorLeft + _mirrorRight) * data.numEntries());
185  Double_t x0 = 0., x1 = 0., x2 = 0.;
186  _sumWgt = 0.;
187  // read the data set into tmp and accumulate some statistics
188  RooRealVar& real = (RooRealVar&)(data.get()->operator[](_varName));
189  for (Int_t i = 0; i < data.numEntries(); ++i) {
190  data.get(i);
191  const Double_t x = real.getVal();
192  const Double_t w = data.weight();
193  x0 += w;
194  x1 += w * x;
195  x2 += w * x * x;
196  _sumWgt += Double_t(1 + _mirrorLeft + _mirrorRight) * w;
197 
198  Data p;
199  p.x = x, p.w = w;
200  tmp.push_back(p);
201  if (_mirrorLeft) {
202  p.x = 2. * _lo - x;
203  tmp.push_back(p);
204  }
205  if (_mirrorRight) {
206  p.x = 2. * _hi - x;
207  tmp.push_back(p);
208  }
209  }
210  // sort the entire data set so that values of x are increasing
211  std::sort(tmp.begin(), tmp.end(), cmp());
212 
213  // copy the sorted data set to its final destination
214  _nEvents = tmp.size();
215  _dataPts = new Double_t[_nEvents];
216  _dataWgts = new Double_t[_nEvents];
217  for (unsigned i = 0; i < tmp.size(); ++i) {
218  _dataPts[i] = tmp[i].x;
219  _dataWgts[i] = tmp[i].w;
220  }
221  {
222  // free tmp
223  std::vector<Data> tmp2;
224  tmp2.swap(tmp);
225  }
226 
227  Double_t meanv=x1/x0;
228  Double_t sigmav=std::sqrt(x2/x0-meanv*meanv);
229  Double_t h=std::pow(Double_t(4)/Double_t(3),0.2)*std::pow(_nEvents,-0.2)*_rho;
230  Double_t hmin=h*sigmav*std::sqrt(2.)/10;
231  Double_t norm=h*std::sqrt(sigmav)/(2.0*std::sqrt(3.0));
232 
233  _weights=new Double_t[_nEvents];
234  for(Int_t j=0;j<_nEvents;++j) {
235  _weights[j]=norm/std::sqrt(g(_dataPts[j],h*sigmav));
236  if (_weights[j]<hmin) _weights[j]=hmin;
237  }
238 
239  // The idea below is that beyond nSigma sigma, the value of the exponential
240  // in the Gaussian is well below the machine precision of a double, so it
241  // does not contribute any more. That way, we can limit how many bins of the
242  // binned approximation in _lookupTable we have to touch when filling it.
243  for (Int_t i=0;i<_nPoints+1;++i) _lookupTable[i] = 0.;
244  for(Int_t j=0;j<_nEvents;++j) {
245  const Double_t xlo = std::min(_hi,
246  std::max(_lo, _dataPts[j] - _nSigma * _weights[j]));
247  const Double_t xhi = std::max(_lo,
248  std::min(_hi, _dataPts[j] + _nSigma * _weights[j]));
249  if (xlo >= xhi) continue;
250  const Double_t chi2incr = _binWidth / _weights[j] / std::sqrt(2.);
251  const Double_t weightratio = _dataWgts[j] / _weights[j];
252  const Int_t binlo = static_cast<Int_t>(std::floor((xlo - _lo) / _binWidth));
253  const Int_t binhi = static_cast<Int_t>(_nPoints - std::floor((_hi - xhi) / _binWidth));
254  const Double_t x = (Double_t(_nPoints - binlo) * _lo +
255  Double_t(binlo) * _hi) / Double_t(_nPoints);
256  Double_t chi = (x - _dataPts[j]) / _weights[j] / std::sqrt(2.);
257  for (Int_t k = binlo; k <= binhi; ++k, chi += chi2incr) {
258  _lookupTable[k] += weightratio * std::exp(- chi * chi);
259  }
260  }
261  if (_asymLeft) {
262  for(Int_t j=0;j<_nEvents;++j) {
263  const Double_t xlo = std::min(_hi,
264  std::max(_lo, 2. * _lo - _dataPts[j] + _nSigma * _weights[j]));
265  const Double_t xhi = std::max(_lo,
266  std::min(_hi, 2. * _lo - _dataPts[j] - _nSigma * _weights[j]));
267  if (xlo >= xhi) continue;
268  const Double_t chi2incr = _binWidth / _weights[j] / std::sqrt(2.);
269  const Double_t weightratio = _dataWgts[j] / _weights[j];
270  const Int_t binlo = static_cast<Int_t>(std::floor((xlo - _lo) / _binWidth));
271  const Int_t binhi = static_cast<Int_t>(_nPoints - std::floor((_hi - xhi) / _binWidth));
272  const Double_t x = (Double_t(_nPoints - binlo) * _lo +
273  Double_t(binlo) * _hi) / Double_t(_nPoints);
274  Double_t chi = (x - (2. * _lo - _dataPts[j])) / _weights[j] / std::sqrt(2.);
275  for (Int_t k = binlo; k <= binhi; ++k, chi += chi2incr) {
276  _lookupTable[k] -= weightratio * std::exp(- chi * chi);
277  }
278  }
279  }
280  if (_asymRight) {
281  for(Int_t j=0;j<_nEvents;++j) {
282  const Double_t xlo = std::min(_hi,
283  std::max(_lo, 2. * _hi - _dataPts[j] + _nSigma * _weights[j]));
284  const Double_t xhi = std::max(_lo,
285  std::min(_hi, 2. * _hi - _dataPts[j] - _nSigma * _weights[j]));
286  if (xlo >= xhi) continue;
287  const Double_t chi2incr = _binWidth / _weights[j] / std::sqrt(2.);
288  const Double_t weightratio = _dataWgts[j] / _weights[j];
289  const Int_t binlo = static_cast<Int_t>(std::floor((xlo - _lo) / _binWidth));
290  const Int_t binhi = static_cast<Int_t>(_nPoints - std::floor((_hi - xhi) / _binWidth));
291  const Double_t x = (Double_t(_nPoints - binlo) * _lo +
292  Double_t(binlo) * _hi) / Double_t(_nPoints);
293  Double_t chi = (x - (2. * _hi - _dataPts[j])) / _weights[j] / std::sqrt(2.);
294  for (Int_t k = binlo; k <= binhi; ++k, chi += chi2incr) {
295  _lookupTable[k] -= weightratio * std::exp(- chi * chi);
296  }
297  }
298  }
299  static const Double_t sqrt2pi(std::sqrt(2*TMath::Pi()));
300  for (Int_t i=0;i<_nPoints+1;++i)
301  _lookupTable[i] /= sqrt2pi * _sumWgt;
302 }
303 
304 ////////////////////////////////////////////////////////////////////////////////
305 
306 Double_t RooKeysPdf::evaluate() const {
307  Int_t i = (Int_t)floor((Double_t(_x)-_lo)/_binWidth);
308  if (i<0) {
309 // cerr << "got point below lower bound:"
310 // << Double_t(_x) << " < " << _lo
311 // << " -- performing linear extrapolation..." << endl;
312  i=0;
313  }
314  if (i>_nPoints-1) {
315 // cerr << "got point above upper bound:"
316 // << Double_t(_x) << " > " << _hi
317 // << " -- performing linear extrapolation..." << endl;
318  i=_nPoints-1;
319  }
320  Double_t dx = (Double_t(_x)-(_lo+i*_binWidth))/_binWidth;
321 
322  // for now do simple linear interpolation.
323  // one day replace by splines...
324  Double_t ret = (_lookupTable[i]+dx*(_lookupTable[i+1]-_lookupTable[i]));
325  if (ret<0) ret=0 ;
326  return ret ;
327 }
328 
329 Int_t RooKeysPdf::getAnalyticalIntegral(
330  RooArgSet& allVars, RooArgSet& analVars, const char* /* rangeName */) const
331 {
332  if (matchArgs(allVars, analVars, _x)) return 1;
333  return 0;
334 }
335 
336 Double_t RooKeysPdf::analyticalIntegral(Int_t code, const char* rangeName) const
337 {
338  R__ASSERT(1 == code);
339  // this code is based on _lookupTable and uses linear interpolation, just as
340  // evaluate(); integration is done using the trapez rule
341  const Double_t xmin = std::max(_lo, _x.min(rangeName));
342  const Double_t xmax = std::min(_hi, _x.max(rangeName));
343  const Int_t imin = (Int_t)floor((xmin - _lo) / _binWidth);
344  const Int_t imax = std::min((Int_t)floor((xmax - _lo) / _binWidth),
345  _nPoints - 1);
346  Double_t sum = 0.;
347  // sum up complete bins in middle
348  if (imin + 1 < imax)
349  sum += _lookupTable[imin + 1] + _lookupTable[imax];
350  for (Int_t i = imin + 2; i < imax; ++i)
351  sum += 2. * _lookupTable[i];
352  sum *= _binWidth * 0.5;
353  // treat incomplete bins
354  const Double_t dxmin = (xmin - (_lo + imin * _binWidth)) / _binWidth;
355  const Double_t dxmax = (xmax - (_lo + imax * _binWidth)) / _binWidth;
356  if (imin < imax) {
357  // first bin
358  sum += _binWidth * (1. - dxmin) * 0.5 * (_lookupTable[imin + 1] +
359  _lookupTable[imin] + dxmin *
360  (_lookupTable[imin + 1] - _lookupTable[imin]));
361  // last bin
362  sum += _binWidth * dxmax * 0.5 * (_lookupTable[imax] +
363  _lookupTable[imax] + dxmax *
364  (_lookupTable[imax + 1] - _lookupTable[imax]));
365  } else if (imin == imax) {
366  // first bin == last bin
367  sum += _binWidth * (dxmax - dxmin) * 0.5 * (
368  _lookupTable[imin] + dxmin *
369  (_lookupTable[imin + 1] - _lookupTable[imin]) +
370  _lookupTable[imax] + dxmax *
371  (_lookupTable[imax + 1] - _lookupTable[imax]));
372  }
373  return sum;
374 }
375 
376 Int_t RooKeysPdf::getMaxVal(const RooArgSet& vars) const
377 {
378  if (vars.contains(*_x.absArg())) return 1;
379  return 0;
380 }
381 
382 Double_t RooKeysPdf::maxVal(Int_t code) const
383 {
384  R__ASSERT(1 == code);
385  Double_t max = -std::numeric_limits<Double_t>::max();
386  for (Int_t i = 0; i <= _nPoints; ++i)
387  if (max < _lookupTable[i]) max = _lookupTable[i];
388  return max;
389 }
390 
391 ////////////////////////////////////////////////////////////////////////////////
392 
393 Double_t RooKeysPdf::g(Double_t x,Double_t sigmav) const {
394  Double_t y=0;
395  // since data is sorted, we can be a little faster because we know which data
396  // points contribute
397  Double_t* it = std::lower_bound(_dataPts, _dataPts + _nEvents,
398  x - _nSigma * sigmav);
399  if (it >= (_dataPts + _nEvents)) return 0.;
400  Double_t* iend = std::upper_bound(it, _dataPts + _nEvents,
401  x + _nSigma * sigmav);
402  for ( ; it < iend; ++it) {
403  const Double_t r = (x - *it) / sigmav;
404  y += std::exp(-0.5 * r * r);
405  }
406 
407  static const Double_t sqrt2pi(std::sqrt(2*TMath::Pi()));
408  return y/(sigmav*sqrt2pi*_nEvents);
409 }