54 ClassImp(TMVA::VariablePCATransform);
59 TMVA::VariablePCATransform::VariablePCATransform( DataSetInfo& dsi )
60 : VariableTransformBase( dsi, Types::kPCA,
"PCA" )
67 TMVA::VariablePCATransform::~VariablePCATransform()
69 for (UInt_t i=0; i<fMeanValues.size(); i++) {
70 if (fMeanValues.at(i) != 0)
delete fMeanValues.at(i);
71 if (fEigenVectors.at(i) != 0)
delete fEigenVectors.at(i);
81 void TMVA::VariablePCATransform::Initialize()
89 Bool_t TMVA::VariablePCATransform::PrepareTransformation (
const std::vector<Event*>& events)
93 if (!IsEnabled() || IsCreated())
return kTRUE;
95 Log() << kINFO <<
"Preparing the Principle Component (PCA) transformation..." << Endl;
97 UInt_t inputSize = fGet.size();
99 SetNVariables(inputSize);
102 if (inputSize <= 1) {
103 Log() << kFATAL <<
"Cannot perform PCA transformation for " << inputSize <<
" variable only" << Endl;
107 if (inputSize > 200) {
108 Log() << kINFO <<
"----------------------------------------------------------------------------"
111 <<
": More than 200 variables, will not calculate PCA!" << Endl;
112 Log() << kINFO <<
"----------------------------------------------------------------------------"
117 CalculatePrincipalComponents( events );
127 const TMVA::Event* TMVA::VariablePCATransform::Transform(
const Event*
const ev, Int_t cls )
const
129 if (!IsCreated())
return 0;
140 if (cls < 0 || cls >= (
int) fMeanValues.size()) cls = fMeanValues.size()-1;
145 if (fTransformedEvent==0 ) {
146 fTransformedEvent =
new Event();
149 std::vector<Float_t> input;
150 std::vector<Char_t> mask;
151 std::vector<Float_t> principalComponents;
153 Bool_t hasMaskedEntries = GetInput( ev, input, mask );
155 if( hasMaskedEntries ){
156 UInt_t numMasked = std::count(mask.begin(), mask.end(), (Char_t)kTRUE);
157 UInt_t numOK = std::count(mask.begin(), mask.end(), (Char_t)kFALSE);
158 if( numMasked>0 && numOK>0 ){
159 Log() << kFATAL <<
"You mixed variables and targets in the decorrelation transformation. This is not possible." << Endl;
161 SetOutput( fTransformedEvent, input, mask, ev );
162 return fTransformedEvent;
165 X2P( principalComponents, input, cls );
166 SetOutput( fTransformedEvent, principalComponents, mask, ev );
168 return fTransformedEvent;
176 const TMVA::Event* TMVA::VariablePCATransform::InverseTransform(
const Event*
const ev, Int_t cls )
const
178 if (!IsCreated())
return 0;
180 const UInt_t nCls = GetNClasses();
186 if (cls < 0 || UInt_t(cls) > nCls) cls = (fMeanValues.size()==1?0:2);
189 if (fBackTransformedEvent==0 ) fBackTransformedEvent =
new Event();
191 std::vector<Float_t> principalComponents;
192 std::vector<Char_t> mask;
193 std::vector<Float_t> output;
195 GetInput( ev, principalComponents, mask, kTRUE );
196 P2X( output, principalComponents, cls );
197 SetOutput( fBackTransformedEvent, output, mask, ev, kTRUE );
199 return fBackTransformedEvent;
206 void TMVA::VariablePCATransform::CalculatePrincipalComponents(
const std::vector< Event*>& events )
208 UInt_t nvars = 0, ntgts = 0, nspcts = 0;
209 CountVariableTypes( nvars, ntgts, nspcts );
210 if( nvars>0 && ntgts>0 )
211 Log() << kFATAL <<
"Variables and targets cannot be mixed in PCA transformation." << Endl;
213 const Int_t inputSize = fGet.size();
216 const UInt_t nCls = GetNClasses();
217 const UInt_t maxPCA = (nCls<=1) ? nCls : nCls+1;
220 std::vector<TPrincipal*> pca(maxPCA);
221 for (UInt_t i=0; i<maxPCA; i++) pca[i] =
new TPrincipal(nvars,
"");
226 Long64_t ievt, entries = events.size();
227 Double_t *dvec =
new Double_t[inputSize];
229 std::vector<Float_t> input;
230 std::vector<Char_t> mask;
231 for (ievt=0; ievt<entries; ievt++) {
232 const Event* ev = events[ievt];
233 UInt_t cls = ev->GetClass();
235 Bool_t hasMaskedEntries = GetInput( ev, input, mask );
236 if (hasMaskedEntries){
237 Log() << kWARNING <<
"Print event which triggers an error" << Endl;
238 std::ostringstream oss;
241 Log() << kFATAL <<
"Masked entries found in event read in when calculating the principal components for the PCA transformation." << Endl;
245 for( std::vector<Float_t>::iterator itInp = input.begin(), itInpEnd = input.end(); itInp != itInpEnd; ++itInp )
247 Float_t value = (*itInp);
248 dvec[iinp] = (Double_t)value;
252 pca.at(cls)->AddRow( dvec );
253 if (nCls > 1) pca.at(maxPCA-1)->AddRow( dvec );
257 for (UInt_t i=0; i<fMeanValues.size(); i++)
if (fMeanValues[i] != 0)
delete fMeanValues[i];
258 for (UInt_t i=0; i<fEigenVectors.size(); i++)
if (fEigenVectors[i] != 0)
delete fEigenVectors[i];
259 fMeanValues.resize(maxPCA,0);
260 fEigenVectors.resize(maxPCA,0);
262 for (UInt_t i=0; i<maxPCA; i++ ) {
263 pca.at(i)->MakePrincipals();
266 fMeanValues[i] =
new TVectorD( *(pca.at(i)->GetMeanValues()) );
267 fEigenVectors[i] =
new TMatrixD( *(pca.at(i)->GetEigenVectors()) );
270 for (UInt_t i=0; i<maxPCA; i++)
delete pca.at(i);
280 void TMVA::VariablePCATransform::X2P( std::vector<Float_t>& pc,
const std::vector<Float_t>& x, Int_t cls )
const
282 const Int_t nInput = x.size();
285 for (Int_t i = 0; i < nInput; i++) {
287 for (Int_t j = 0; j < nInput; j++)
288 pv += (((Double_t)x.at(j)) - (*fMeanValues.at(cls))(j)) * (*fEigenVectors.at(cls))(j,i);
299 void TMVA::VariablePCATransform::P2X( std::vector<Float_t>& x,
const std::vector<Float_t>& pc, Int_t cls )
const
301 const Int_t nInput = pc.size();
304 for (Int_t i = 0; i < nInput; i++) {
306 for (Int_t j = 0; j < nInput; j++)
307 xv += (((Double_t)pc.at(j)) * (*fEigenVectors.at(cls))(i,j) ) + (*fMeanValues.at(cls))(j);
315 void TMVA::VariablePCATransform::WriteTransformationToStream( std::ostream& o )
const
317 for (Int_t sbType=0; sbType<2; sbType++) {
318 o <<
"# PCA mean values " << std::endl;
319 const TVectorD* means = fMeanValues[sbType];
320 o << (sbType==0 ?
"Signal" :
"Background") <<
" " << means->GetNrows() << std::endl;
321 for (Int_t row = 0; row<means->GetNrows(); row++) {
322 o << std::setprecision(12) << std::setw(20) << (*means)[row];
326 o <<
"##" << std::endl;
329 for (Int_t sbType=0; sbType<2; sbType++) {
330 o <<
"# PCA eigenvectors " << std::endl;
331 const TMatrixD* mat = fEigenVectors[sbType];
332 o << (sbType==0 ?
"Signal" :
"Background") <<
" " << mat->GetNrows() <<
" x " << mat->GetNcols() << std::endl;
333 for (Int_t row = 0; row<mat->GetNrows(); row++) {
334 for (Int_t col = 0; col<mat->GetNcols(); col++) {
335 o << std::setprecision(12) << std::setw(20) << (*mat)[row][col] <<
" ";
340 o <<
"##" << std::endl;
346 void TMVA::VariablePCATransform::AttachXMLTo(
void* parent) {
347 void* trfxml = gTools().AddChild(parent,
"Transform");
348 gTools().AddAttr(trfxml,
"Name",
"PCA");
350 VariableTransformBase::AttachXMLTo( trfxml );
353 for (UInt_t sbType=0; sbType<fMeanValues.size(); sbType++) {
354 void* meanxml = gTools().AddChild( trfxml,
"Statistics");
355 const TVectorD* means = fMeanValues[sbType];
356 gTools().AddAttr( meanxml,
"Class", (sbType==0 ?
"Signal" :(sbType==1 ?
"Background":
"Combined")) );
357 gTools().AddAttr( meanxml,
"ClassIndex", sbType );
358 gTools().AddAttr( meanxml,
"NRows", means->GetNrows() );
359 TString meansdef =
"";
360 for (Int_t row = 0; row<means->GetNrows(); row++)
361 meansdef += gTools().StringFromDouble((*means)[row]) +
" ";
362 gTools().AddRawLine( meanxml, meansdef );
366 for (UInt_t sbType=0; sbType<fEigenVectors.size(); sbType++) {
367 void* evxml = gTools().AddChild( trfxml,
"Eigenvectors");
368 const TMatrixD* mat = fEigenVectors[sbType];
369 gTools().AddAttr( evxml,
"Class", (sbType==0 ?
"Signal" :(sbType==1 ?
"Background":
"Combined") ) );
370 gTools().AddAttr( evxml,
"ClassIndex", sbType );
371 gTools().AddAttr( evxml,
"NRows", mat->GetNrows() );
372 gTools().AddAttr( evxml,
"NCols", mat->GetNcols() );
374 for (Int_t row = 0; row<mat->GetNrows(); row++)
375 for (Int_t col = 0; col<mat->GetNcols(); col++)
376 evdef += gTools().StringFromDouble((*mat)[row][col]) +
" ";
377 gTools().AddRawLine( evxml, evdef );
384 void TMVA::VariablePCATransform::ReadFromXML(
void* trfnode )
391 Bool_t newFormat = kFALSE;
393 void* inpnode = NULL;
395 inpnode = gTools().GetChild(trfnode,
"Selection");
402 VariableTransformBase::ReadFromXML( inpnode );
406 void* ch = gTools().GetChild(trfnode);
408 nodeName = gTools().GetName(ch);
409 if (nodeName ==
"Statistics") {
411 gTools().ReadAttr(ch,
"Class", classtype);
412 gTools().ReadAttr(ch,
"ClassIndex", clsIdx);
413 gTools().ReadAttr(ch,
"NRows", nrows);
416 if (fMeanValues.size()<=clsIdx) fMeanValues.resize(clsIdx+1,0);
417 if (fMeanValues[clsIdx]==0) fMeanValues[clsIdx] =
new TVectorD( nrows );
418 fMeanValues[clsIdx]->ResizeTo( nrows );
421 std::stringstream s(gTools().GetContent(ch));
422 for (Int_t row = 0; row<nrows; row++) s >> (*fMeanValues[clsIdx])(row);
424 else if ( nodeName ==
"Eigenvectors" ) {
426 gTools().ReadAttr(ch,
"Class", classtype);
427 gTools().ReadAttr(ch,
"ClassIndex", clsIdx);
428 gTools().ReadAttr(ch,
"NRows", nrows);
429 gTools().ReadAttr(ch,
"NCols", ncols);
431 if (fEigenVectors.size()<=clsIdx) fEigenVectors.resize(clsIdx+1,0);
432 if (fEigenVectors[clsIdx]==0) fEigenVectors[clsIdx] =
new TMatrixD( nrows, ncols );
433 fEigenVectors[clsIdx]->ResizeTo( nrows, ncols );
436 std::stringstream s(gTools().GetContent(ch));
437 for (Int_t row = 0; row<nrows; row++)
438 for (Int_t col = 0; col<ncols; col++)
439 s >> (*fEigenVectors[clsIdx])[row][col];
441 ch = gTools().GetNextChild(ch);
450 void TMVA::VariablePCATransform::ReadTransformationFromStream( std::istream& istr,
const TString& classname )
453 istr.getline(buf,512);
454 TString strvar, dummy;
455 Int_t nrows(0), ncols(0);
456 UInt_t classIdx=(classname==
"signal"?0:1);
458 for (UInt_t i=0; i<fMeanValues.size(); i++) {
459 if (fMeanValues.at(i) != 0)
delete fMeanValues.at(i);
460 if (fEigenVectors.at(i) != 0)
delete fEigenVectors.at(i);
462 fMeanValues.resize(3);
463 fEigenVectors.resize(3);
465 Log() << kINFO <<
"VariablePCATransform::ReadTransformationFromStream(): " << Endl;
467 while (!(buf[0]==
'#'&& buf[1]==
'#')) {
469 while (*p==
' ' || *p==
'\t') p++;
470 if (*p==
'#' || *p==
'\0') {
471 istr.getline(buf,512);
474 std::stringstream sstr(buf);
476 if (strvar==
"signal" || strvar==
"background") {
479 Int_t sbType = (strvar==
"signal" ? 0 : 1);
481 if (fMeanValues[sbType] == 0) fMeanValues[sbType] =
new TVectorD( nrows );
482 else fMeanValues[sbType]->ResizeTo( nrows );
485 for (Int_t row = 0; row<nrows; row++) istr >> (*fMeanValues[sbType])(row);
489 istr.getline(buf,512);
493 istr.getline(buf,512);
494 while (!(buf[0]==
'#'&& buf[1]==
'#')) {
496 while(*p==
' ' || *p==
'\t') p++;
497 if (*p==
'#' || *p==
'\0') {
498 istr.getline(buf,512);
501 std::stringstream sstr(buf);
503 if (strvar==
"signal" || strvar==
"background") {
506 sstr >> nrows >> dummy >> ncols;
507 Int_t sbType = (strvar==
"signal" ? 0 : 1);
509 if (fEigenVectors[sbType] == 0) fEigenVectors[sbType] =
new TMatrixD( nrows, ncols );
510 else fEigenVectors[sbType]->ResizeTo( nrows, ncols );
513 for (Int_t row = 0; row<fEigenVectors[sbType]->GetNrows(); row++) {
514 for (Int_t col = 0; col<fEigenVectors[sbType]->GetNcols(); col++) {
515 istr >> (*fEigenVectors[sbType])[row][col];
520 istr.getline(buf,512);
522 fMeanValues[2] =
new TVectorD( *fMeanValues[classIdx] );
523 fEigenVectors[2] =
new TMatrixD( *fEigenVectors[classIdx] );
531 void TMVA::VariablePCATransform::MakeFunction( std::ostream& fout,
const TString& fcncName,
532 Int_t part, UInt_t trCounter, Int_t )
534 UInt_t nvar = fEigenVectors[0]->GetNrows();
537 UInt_t numC = fMeanValues.size();
540 fout <<
" void X2P_"<<trCounter<<
"( const double*, double*, int ) const;" << std::endl;
541 fout <<
" double fMeanValues_"<<trCounter<<
"["<<numC<<
"]["
542 << fMeanValues[0]->GetNrows() <<
"];" << std::endl;
543 fout <<
" double fEigenVectors_"<<trCounter<<
"["<<numC<<
"]["
544 << fEigenVectors[0]->GetNrows() <<
"]["
545 << fEigenVectors[0]->GetNcols() <<
"];" << std::endl;
551 if (fMeanValues[0]->GetNrows() != fMeanValues[1]->GetNrows() ||
552 fEigenVectors[0]->GetNrows() != fEigenVectors[1]->GetNrows() ||
553 fEigenVectors[0]->GetNcols() != fEigenVectors[1]->GetNcols()) {
554 Log() << kFATAL <<
"<MakeFunction> Mismatch in vector/matrix dimensions" << Endl;
561 fout <<
"//_______________________________________________________________________" << std::endl;
562 fout <<
"inline void " << fcncName <<
"::X2P_"<<trCounter<<
"( const double* x, double* p, int index ) const" << std::endl;
563 fout <<
"{" << std::endl;
564 fout <<
" // Calculate the principal components from the original data vector" << std::endl;
565 fout <<
" // x, and return it in p (function extracted from TPrincipal::X2P)" << std::endl;
566 fout <<
" // It's the users responsibility to make sure that both x and p are" << std::endl;
567 fout <<
" // of the right size (i.e., memory must be allocated for p)." << std::endl;
568 fout <<
" const int nVar = " << nvar <<
";" << std::endl;
570 fout <<
" for (int i = 0; i < nVar; i++) {" << std::endl;
571 fout <<
" p[i] = 0;" << std::endl;
572 fout <<
" for (int j = 0; j < nVar; j++) p[i] += (x[j] - fMeanValues_"<<trCounter<<
"[index][j]) * fEigenVectors_"<<trCounter<<
"[index][j][i];" << std::endl;
573 fout <<
" }" << std::endl;
574 fout <<
"}" << std::endl;
576 fout <<
"//_______________________________________________________________________" << std::endl;
577 fout <<
"inline void " << fcncName <<
"::InitTransform_"<<trCounter<<
"()" << std::endl;
578 fout <<
"{" << std::endl;
579 fout <<
" // PCA transformation, initialisation" << std::endl;
582 fout <<
" // initialise vector of mean values" << std::endl;
583 std::streamsize dp = fout.precision();
584 for (UInt_t index=0; index<numC; index++) {
585 for (
int i=0; i<fMeanValues[index]->GetNrows(); i++) {
586 fout <<
" fMeanValues_"<<trCounter<<
"["<<index<<
"]["<<i<<
"] = " << std::setprecision(12)
587 << (*fMeanValues[index])(i) <<
";" << std::endl;
593 fout <<
" // initialise matrix of eigenvectors" << std::endl;
594 for (UInt_t index=0; index<numC; index++) {
595 for (
int i=0; i<fEigenVectors[index]->GetNrows(); i++) {
596 for (
int j=0; j<fEigenVectors[index]->GetNcols(); j++) {
597 fout <<
" fEigenVectors_"<<trCounter<<
"["<<index<<
"]["<<i<<
"]["<<j<<
"] = " << std::setprecision(12)
598 << (*fEigenVectors[index])(i,j) <<
";" << std::endl;
602 fout << std::setprecision(dp);
603 fout <<
"}" << std::endl;
605 fout <<
"//_______________________________________________________________________" << std::endl;
606 fout <<
"inline void " << fcncName <<
"::Transform_"<<trCounter<<
"( std::vector<double>& iv, int cls ) const" << std::endl;
607 fout <<
"{" << std::endl;
608 fout <<
" // PCA transformation" << std::endl;
609 fout <<
" const int nVar = " << nvar <<
";" << std::endl;
610 fout <<
" double *dv = new double[nVar];" << std::endl;
611 fout <<
" double *rv = new double[nVar];" << std::endl;
612 fout <<
" if (cls < 0 || cls > "<<GetNClasses()<<
") {"<< std::endl;
613 fout <<
" if ("<<GetNClasses()<<
" > 1 ) cls = "<<GetNClasses()<<
";"<< std::endl;
614 fout <<
" else cls = "<<(numC==1?0:2)<<
";"<< std::endl;
615 fout <<
" }"<< std::endl;
617 VariableTransformBase::MakeFunction(fout, fcncName, 0, trCounter, 0 );
619 fout <<
" for (int ivar=0; ivar<nVar; ivar++) dv[ivar] = iv[indicesGet.at(ivar)];" << std::endl;
622 fout <<
" // Perform PCA and put it into PCAed events tree" << std::endl;
623 fout <<
" this->X2P_"<<trCounter<<
"( dv, rv, cls );" << std::endl;
624 fout <<
" for (int ivar=0; ivar<nVar; ivar++) iv[indicesPut.at(ivar)] = rv[ivar];" << std::endl;
627 fout <<
" delete [] dv;" << std::endl;
628 fout <<
" delete [] rv;" << std::endl;
629 fout <<
"}" << std::endl;