ClassImp(TMVA::MethodDNN);

TMVA::MethodDNN::MethodDNN(const TString &jobName, const TString &methodTitle, DataSetInfo &theData,
                           const TString &theOption)
   : MethodBase(jobName, Types::kDNN, methodTitle, theData, theOption), fWeightInitialization(), fOutputFunction(),
     fLayoutString(), fErrorStrategy(), fTrainingStrategyString(), fWeightInitializationString(),
     fArchitectureString(), fTrainingSettings(), fResume(false), fSettings()
{
}

TMVA::MethodDNN::MethodDNN(DataSetInfo& theData,
                           const TString& theWeightFile)
   : MethodBase(Types::kDNN, theData, theWeightFile),
     fWeightInitialization(), fOutputFunction(), fLayoutString(), fErrorStrategy(),
     fTrainingStrategyString(), fWeightInitializationString(), fArchitectureString(),
     fTrainingSettings(), fResume(false), fSettings()
{
   fWeightInitialization = DNN::EInitialization::kGauss;
   fOutputFunction = DNN::EOutputFunction::kSigmoid;
}

TMVA::MethodDNN::~MethodDNN()
{
   fWeightInitialization = DNN::EInitialization::kGauss;
   fOutputFunction = DNN::EOutputFunction::kSigmoid;
}
Bool_t TMVA::MethodDNN::HasAnalysisType(Types::EAnalysisType type,
                                        UInt_t numberClasses,
                                        UInt_t /*numberTargets*/)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass) return kTRUE;
   if (type == Types::kRegression) return kTRUE;

   return kFALSE;
}

void TMVA::MethodDNN::Init() {}
void TMVA::MethodDNN::DeclareOptions()
{
   DeclareOptionRef(fLayoutString = "SOFTSIGN|(N+100)*2,LINEAR",
                    "Layout",
                    "Layout of the network.");

   DeclareOptionRef(fValidationSize = "20%", "ValidationSize",
                    "Part of the training data to use for "
                    "validation. Specify as 0.2 or 20% to use a "
                    "fifth of the data set as validation set. "
                    "Specify as 100 to use exactly 100 events.");

   DeclareOptionRef(fErrorStrategy = "CROSSENTROPY",
                    "ErrorStrategy",
                    "Loss function: Mean squared error (regression)"
                    " or cross entropy (binary classification).");
   AddPreDefVal(TString("CROSSENTROPY"));
   AddPreDefVal(TString("SUMOFSQUARES"));
   AddPreDefVal(TString("MUTUALEXCLUSIVE"));

   DeclareOptionRef(fWeightInitializationString = "XAVIER",
                    "WeightInitialization",
                    "Weight initialization strategy");
   AddPreDefVal(TString("XAVIER"));
   AddPreDefVal(TString("XAVIERUNIFORM"));

   DeclareOptionRef(fArchitectureString = "CPU",
                    "Architecture",
                    "Which architecture to perform the training on.");
   AddPreDefVal(TString("STANDARD"));
   AddPreDefVal(TString("CPU"));
   AddPreDefVal(TString("GPU"));
   AddPreDefVal(TString("OPENCL"));

   DeclareOptionRef(fTrainingStrategyString = "LearningRate=1e-1,"
                                              "ConvergenceSteps=50,"
                                              "DropRepetitions=5|LearningRate=1e-4,"
                                              "ConvergenceSteps=50,"
                                              "DropConfig=0.0+0.5+0.5,"
                                              "Multithreading=True",
                    "TrainingStrategy",
                    "Defines the training strategies.");
}
auto TMVA::MethodDNN::ParseLayoutString(TString layoutString)
   -> LayoutVector_t
{
   LayoutVector_t layout;
   const TString layerDelimiter(",");
   const TString subDelimiter("|");

   const size_t inputSize = GetNvar();

   TObjArray* layerStrings = layoutString.Tokenize(layerDelimiter);
   TIter nextLayer(layerStrings);
   TObjString* layerString = (TObjString*)nextLayer();

   for (; layerString != nullptr; layerString = (TObjString*)nextLayer()) {
      int numNodes = 0;
      EActivationFunction activationFunction = EActivationFunction::kTanh;

      TObjArray* subStrings = layerString->GetString().Tokenize(subDelimiter);
      TIter nextToken(subStrings);
      TObjString* token = (TObjString*)nextToken();

      for (; token != nullptr; token = (TObjString*)nextToken()) {
         TString strActFnc(token->GetString());
         if (strActFnc == "RELU") {
            activationFunction = DNN::EActivationFunction::kRelu;
         } else if (strActFnc == "TANH") {
            activationFunction = DNN::EActivationFunction::kTanh;
         } else if (strActFnc == "SYMMRELU") {
            activationFunction = DNN::EActivationFunction::kSymmRelu;
         } else if (strActFnc == "SOFTSIGN") {
            activationFunction = DNN::EActivationFunction::kSoftSign;
         } else if (strActFnc == "SIGMOID") {
            activationFunction = DNN::EActivationFunction::kSigmoid;
         } else if (strActFnc == "LINEAR") {
            activationFunction = DNN::EActivationFunction::kIdentity;
         } else if (strActFnc == "GAUSS") {
            activationFunction = DNN::EActivationFunction::kGauss;
         } else {
            // Not an activation function name: interpret the token as the number
            // of nodes, with N/n standing for the number of input variables.
            TString strNumNodes(token->GetString());
            TString strN("x");
            strNumNodes.ReplaceAll("N", strN);
            strNumNodes.ReplaceAll("n", strN);
            TFormula fml("tmp", strNumNodes);
            numNodes = fml.Eval(inputSize);
         }
      }

      layout.push_back(std::make_pair(numNodes, activationFunction));
   }

   return layout;
}
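// Illustration (assumed numbers): with 10 input variables (N = 10), the layout
// string "TANH|(N+30)*2,TANH|(N+30),LINEAR" parses into
//    {(80, kTanh), (40, kTanh), (0, kIdentity)}
// The node count of the last entry is not used; the width of the output layer
// is set from the analysis type in ProcessOptions().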
auto TMVA::MethodDNN::ParseKeyValueString(TString parseString,
                                          TString blockDelim,
                                          TString tokenDelim)
   -> KeyValueVector_t
{
   KeyValueVector_t blockKeyValues;
   const TString keyValueDelim("=");

   TObjArray* blockStrings = parseString.Tokenize(blockDelim);
   TIter nextBlock(blockStrings);
   TObjString* blockString = (TObjString *)nextBlock();

   for (; blockString != nullptr; blockString = (TObjString *)nextBlock()) {
      blockKeyValues.push_back(std::map<TString, TString>());
      std::map<TString, TString>& currentBlock = blockKeyValues.back();

      TObjArray* subStrings = blockString->GetString().Tokenize(tokenDelim);
      TIter nextToken(subStrings);
      TObjString* token = (TObjString*)nextToken();

      for (; token != nullptr; token = (TObjString *)nextToken()) {
         TString strKeyValue(token->GetString());
         int delimPos = strKeyValue.First(keyValueDelim.Data());

         TString strKey   = TString(strKeyValue(0, delimPos));
         TString strValue = TString(strKeyValue(delimPos + 1, strKeyValue.Length()));

         // TString::Strip returns the stripped copy; assign it back.
         strKey   = strKey.Strip(TString::kBoth, ' ');
         strValue = strValue.Strip(TString::kBoth, ' ');

         currentBlock.insert(std::make_pair(strKey, strValue));
      }
   }

   return blockKeyValues;
}
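// Illustration (assumed example):
//    ParseKeyValueString("LearningRate=1e-1,BatchSize=32|LearningRate=1e-3",
//                        TString("|"), TString(","))
// returns two blocks: { {"LearningRate","1e-1"}, {"BatchSize","32"} } and
// { {"LearningRate","1e-3"} }.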
// Helpers to retrieve typed settings from a parsed key-value block.
TString fetchValue(const std::map<TString, TString>& keyValueMap, TString key)
{
   std::map<TString, TString>::const_iterator it = keyValueMap.find(key);
   if (it == keyValueMap.end()) {
      return TString("");
   }
   return it->second;
}

template <typename T>
T fetchValue(const std::map<TString, TString>& keyValueMap,
             TString key, T defaultValue);

template <>
int fetchValue(const std::map<TString, TString>& keyValueMap,
               TString key, int defaultValue)
{
   TString value(fetchValue(keyValueMap, key));
   if (value == "") {
      return defaultValue;
   }
   return value.Atoi();
}

template <>
double fetchValue(const std::map<TString, TString>& keyValueMap,
                  TString key, double defaultValue)
{
   TString value(fetchValue(keyValueMap, key));
   if (value == "") {
      return defaultValue;
   }
   return value.Atof();
}

template <>
TString fetchValue(const std::map<TString, TString>& keyValueMap,
                   TString key, TString defaultValue)
{
   TString value(fetchValue(keyValueMap, key));
   if (value == "") {
      return defaultValue;
   }
   return value;
}

template <>
bool fetchValue(const std::map<TString, TString>& keyValueMap,
                TString key, bool defaultValue)
{
   TString value(fetchValue(keyValueMap, key));
   if (value == "") {
      return defaultValue;
   }
   value.ToUpper();
   if (value == "TRUE" || value == "T" || value == "1") {
      return true;
   }
   return false;
}

template <>
std::vector<double> fetchValue(const std::map<TString, TString>& keyValueMap,
                               TString key,
                               std::vector<double> defaultValue)
{
   TString parseString(fetchValue(keyValueMap, key));
   if (parseString == "") {
      return defaultValue;
   }
   parseString.ToUpper();
   std::vector<double> values;

   const TString tokenDelim("+");
   TObjArray* tokenStrings = parseString.Tokenize(tokenDelim);
   TIter nextToken(tokenStrings);
   TObjString* tokenString = (TObjString*)nextToken();
   for (; tokenString != NULL; tokenString = (TObjString*)nextToken()) {
      std::stringstream sstr;
      double currentValue;
      sstr << tokenString->GetString().Data();
      sstr >> currentValue;
      values.push_back(currentValue);
   }
   return values;
}
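// Illustration (assumed example): for a block containing DropConfig=0.0+0.5+0.5,
//    fetchValue(block, "DropConfig", std::vector<double>())
// returns {0.0, 0.5, 0.5}; a missing key returns the supplied default value.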
void TMVA::MethodDNN::ProcessOptions()
{
   if (IgnoreEventsWithNegWeightsInTraining()) {
      Log() << kINFO
            << "Will ignore negative events in training!"
            << Endl;
   }

   if (fArchitectureString == "STANDARD") {
      Log() << kERROR << "The STANDARD architecture has been deprecated. "
                         "Please use Architecture=CPU or Architecture=GPU. "
                         "See the TMVA Users' Guide for instructions if you "
                         "encounter problems."
            << Endl;
      Log() << kFATAL << "The STANDARD architecture has been deprecated. "
                         "Please use Architecture=CPU or Architecture=GPU. "
                         "See the TMVA Users' Guide for instructions if you "
                         "encounter problems."
            << Endl;
   }

   if (fArchitectureString == "OPENCL") {
      Log() << kERROR << "The OPENCL architecture has not been implemented yet. "
                         "Please use Architecture=CPU or Architecture=GPU for the "
                         "time being. See the TMVA Users' Guide for instructions "
                         "if you encounter problems."
            << Endl;
      Log() << kFATAL << "The OPENCL architecture has not been implemented yet. "
                         "Please use Architecture=CPU or Architecture=GPU for the "
                         "time being. See the TMVA Users' Guide for instructions "
                         "if you encounter problems."
            << Endl;
   }

   if (fArchitectureString == "GPU") {
#ifndef DNNCUDA // Included only if DNNCUDA flag is _not_ set.
      Log() << kERROR << "CUDA backend not enabled. Please make sure "
                         "you have CUDA installed and it was successfully "
                         "detected by CMAKE."
            << Endl;
      Log() << kFATAL << "CUDA backend not enabled. Please make sure "
                         "you have CUDA installed and it was successfully "
                         "detected by CMAKE."
            << Endl;
#endif // DNNCUDA
   }

   if (fArchitectureString == "CPU") {
#ifndef DNNCPU // Included only if DNNCPU flag is _not_ set.
      Log() << kERROR << "Multi-core CPU backend not enabled. Please make sure "
                         "you have a BLAS implementation and it was successfully "
                         "detected by CMake, and that the imt CMake flag is set."
            << Endl;
      Log() << kFATAL << "Multi-core CPU backend not enabled. Please make sure "
                         "you have a BLAS implementation and it was successfully "
                         "detected by CMake, and that the imt CMake flag is set."
            << Endl;
#endif // DNNCPU
   }
   fLayout = TMVA::MethodDNN::ParseLayoutString(fLayoutString);
   size_t inputSize = GetNVariables();
   size_t outputSize = 1;
   if (fAnalysisType == Types::kRegression && GetNTargets() != 0) {
      outputSize = GetNTargets();
   } else if (fAnalysisType == Types::kMulticlass && DataInfo().GetNClasses() >= 2) {
      outputSize = DataInfo().GetNClasses();
   }

   fNet.SetBatchSize(1);
   fNet.SetInputWidth(inputSize);

   auto itLayout    = std::begin(fLayout);
   auto itLayoutEnd = std::end(fLayout) - 1;
   for (; itLayout != itLayoutEnd; ++itLayout) {
      fNet.AddLayer((*itLayout).first, (*itLayout).second);
   }
   fNet.AddLayer(outputSize, EActivationFunction::kIdentity);

   fOutputFunction = EOutputFunction::kSigmoid;
   if (fAnalysisType == Types::kClassification) {
      if (fErrorStrategy == "SUMOFSQUARES") {
         fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
      }
      if (fErrorStrategy == "CROSSENTROPY") {
         fNet.SetLossFunction(ELossFunction::kCrossEntropy);
      }
      fOutputFunction = EOutputFunction::kSigmoid;
   } else if (fAnalysisType == Types::kRegression) {
      if (fErrorStrategy != "SUMOFSQUARES") {
         Log() << kWARNING << "For regression only SUMOFSQUARES is a valid "
               << "neural net error function. Setting error function to "
               << "SUMOFSQUARES now." << Endl;
      }
      fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
      fOutputFunction = EOutputFunction::kIdentity;
   } else if (fAnalysisType == Types::kMulticlass) {
      if (fErrorStrategy == "SUMOFSQUARES") {
         fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
      }
      if (fErrorStrategy == "CROSSENTROPY") {
         fNet.SetLossFunction(ELossFunction::kCrossEntropy);
      }
      if (fErrorStrategy == "MUTUALEXCLUSIVE") {
         fNet.SetLossFunction(ELossFunction::kSoftmaxCrossEntropy);
      }
      fOutputFunction = EOutputFunction::kSoftmax;
   }

   if (fWeightInitializationString == "XAVIER") {
      fWeightInitialization = DNN::EInitialization::kGauss;
   } else if (fWeightInitializationString == "XAVIERUNIFORM") {
      fWeightInitialization = DNN::EInitialization::kUniform;
   } else {
      fWeightInitialization = DNN::EInitialization::kGauss;
   }
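   // Summary of the mappings implemented above (descriptive only):
   //    Classification: CROSSENTROPY -> kCrossEntropy, SUMOFSQUARES -> kMeanSquaredError; output kSigmoid
   //    Regression:     always kMeanSquaredError; output kIdentity
   //    Multiclass:     MUTUALEXCLUSIVE -> kSoftmaxCrossEntropy (plus the two above); output kSoftmax
   //    WeightInitialization: XAVIER -> kGauss, XAVIERUNIFORM -> kUniform, otherwise kGauss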
   // Validate the ValidationSize option early; GetNumValidationSamples() aborts
   // with kFATAL on an unusable value.
   GetNumValidationSamples();

   KeyValueVector_t strategyKeyValues = ParseKeyValueString(fTrainingStrategyString,
                                                            TString("|"),
                                                            TString(","));

   std::cout << "Parsed Training DNN string " << fTrainingStrategyString << std::endl;
   std::cout << "String has size " << strategyKeyValues.size() << std::endl;
   for (auto& block : strategyKeyValues) {
      TTrainingSettings settings;

      settings.convergenceSteps = fetchValue(block, "ConvergenceSteps", 100);
      settings.batchSize        = fetchValue(block, "BatchSize", 30);
      settings.testInterval     = fetchValue(block, "TestRepetitions", 7);
      settings.weightDecay      = fetchValue(block, "WeightDecay", 0.0);
      settings.learningRate     = fetchValue(block, "LearningRate", 1e-5);
      settings.momentum         = fetchValue(block, "Momentum", 0.3);
      settings.dropoutProbabilities = fetchValue(block, "DropConfig",
                                                 std::vector<Double_t>());

      TString regularization = fetchValue(block, "Regularization", TString("NONE"));
      if (regularization == "L1") {
         settings.regularization = DNN::ERegularization::kL1;
      } else if (regularization == "L2") {
         settings.regularization = DNN::ERegularization::kL2;
      } else {
         settings.regularization = DNN::ERegularization::kNone;
      }

      TString strMultithreading = fetchValue(block, "Multithreading", TString("True"));
      if (strMultithreading.BeginsWith("T")) {
         settings.multithreading = true;
      } else {
         settings.multithreading = false;
      }

      fTrainingSettings.push_back(settings);
   }
}
UInt_t TMVA::MethodDNN::GetNumValidationSamples()
{
   Int_t nValidationSamples = 0;
   UInt_t trainingSetSize = GetEventCollection(Types::kTraining).size();

   // The ValidationSize option is either a percentage ("20%"), a fraction ("0.2")
   // or an absolute number of events ("100").
   if (fValidationSize.EndsWith("%")) {
      TString intValStr = TString(fValidationSize.Strip(TString::kTrailing, '%'));

      if (intValStr.IsFloat()) {
         Double_t valSizeAsDouble = fValidationSize.Atof() / 100.0;
         nValidationSamples = GetEventCollection(Types::kTraining).size() * valSizeAsDouble;
      } else {
         Log() << kFATAL << "Cannot parse number \"" << fValidationSize
               << "\". Expected string like \"20%\" or \"20.0%\"." << Endl;
      }
   } else if (fValidationSize.IsFloat()) {
      Double_t valSizeAsDouble = fValidationSize.Atof();

      if (valSizeAsDouble < 1.0) {
         // Interpret as a fraction of the training set.
         nValidationSamples = GetEventCollection(Types::kTraining).size() * valSizeAsDouble;
      } else {
         // Interpret as an absolute number of events.
         nValidationSamples = valSizeAsDouble;
      }
   } else {
      Log() << kFATAL << "Cannot parse number \"" << fValidationSize
            << "\". Expected string like \"0.2\" or \"100\"." << Endl;
   }

   // Sanity checks.
   if (nValidationSamples < 0) {
      Log() << kFATAL << "Validation size \"" << fValidationSize << "\" is negative." << Endl;
   }

   if (nValidationSamples == 0) {
      Log() << kFATAL << "Validation size \"" << fValidationSize << "\" is zero." << Endl;
   }

   if (nValidationSamples >= (Int_t)trainingSetSize) {
      Log() << kFATAL << "Validation size \"" << fValidationSize
            << "\" is larger than or equal to the training set size (" << trainingSetSize << ")." << Endl;
   }

   return nValidationSamples;
}
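// Illustration (assumed numbers): for a training set of 1000 events,
//    ValidationSize=20%  -> 200 validation events
//    ValidationSize=0.2  -> 200 validation events
//    ValidationSize=100  -> 100 validation events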
void TMVA::MethodDNN::Train()
{
   if (fInteractive && fInteractive->NotInitialized()) {
      std::vector<TString> titles = {"Error on training set", "Error on test set"};
      fInteractive->Init(titles);
   }

   for (TTrainingSettings & settings : fTrainingSettings) {
      size_t nValidationSamples = GetNumValidationSamples();
      size_t nTrainingSamples = GetEventCollection(Types::kTraining).size() - nValidationSamples;
      size_t nTestSamples = nValidationSamples;

      if (nTrainingSamples < settings.batchSize ||
          nValidationSamples < settings.batchSize ||
          nTestSamples < settings.batchSize) {
         Log() << kFATAL << "Number of samples in the datasets are train: "
               << nTrainingSamples << " valid: " << nValidationSamples
               << " test: " << nTestSamples << ". "
               << "One of these is smaller than the batch size of "
               << settings.batchSize << ". Please increase the batch"
               << " size to be at least the same size as the smallest"
               << " of these values." << Endl;
      }
   }

   if (fArchitectureString == "GPU") {
      TrainGpu();
      if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
      ExitFromTraining();
      return;
   } else if (fArchitectureString == "OPENCL") {
      Log() << kFATAL << "OpenCL backend not yet supported." << Endl;
      return;
   } else if (fArchitectureString == "CPU") {
      TrainCpu();
      if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
      ExitFromTraining();
      return;
   }

   Log() << kINFO << "Using Standard Implementation." << Endl;
   std::vector<Pattern> trainPattern;
   std::vector<Pattern> testPattern;

   size_t nValidationSamples = GetNumValidationSamples();
   size_t nTrainingSamples = GetEventCollection(Types::kTraining).size() - nValidationSamples;

   const std::vector<TMVA::Event *> &allData = GetEventCollection(Types::kTraining);
   const std::vector<TMVA::Event *> eventCollectionTraining{allData.begin(), allData.begin() + nTrainingSamples};
   const std::vector<TMVA::Event *> eventCollectionTesting{allData.begin() + nTrainingSamples, allData.end()};

   for (auto &event : eventCollectionTraining) {
      const std::vector<Float_t>& values = event->GetValues();
      if (fAnalysisType == Types::kClassification) {
         double outputValue = event->GetClass() == 0 ? 0.9 : 0.1;
         trainPattern.push_back(Pattern(values.begin(),
                                        values.end(),
                                        outputValue,
                                        event->GetWeight()));
         trainPattern.back().addInput(1.0);
      } else if (fAnalysisType == Types::kMulticlass) {
         std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
         oneHot[event->GetClass()] = 1.0;
         trainPattern.push_back(Pattern(values.begin(), values.end(),
                                        oneHot.cbegin(), oneHot.cend(),
                                        event->GetWeight()));
         trainPattern.back().addInput(1.0);
      } else {
         const std::vector<Float_t>& targets = event->GetTargets();
         trainPattern.push_back(Pattern(values.begin(),
                                        values.end(),
                                        targets.begin(),
                                        targets.end(),
                                        event->GetWeight()));
         trainPattern.back().addInput(1.0);
      }
   }

   for (auto &event : eventCollectionTesting) {
      const std::vector<Float_t>& values = event->GetValues();
      if (fAnalysisType == Types::kClassification) {
         double outputValue = event->GetClass() == 0 ? 0.9 : 0.1;
         testPattern.push_back(Pattern(values.begin(),
                                       values.end(),
                                       outputValue,
                                       event->GetWeight()));
         testPattern.back().addInput(1.0);
      } else if (fAnalysisType == Types::kMulticlass) {
         std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
         oneHot[event->GetClass()] = 1.0;
         testPattern.push_back(Pattern(values.begin(), values.end(),
                                       oneHot.cbegin(), oneHot.cend(),
                                       event->GetWeight()));
         testPattern.back().addInput(1.0);
      } else {
         const std::vector<Float_t>& targets = event->GetTargets();
         testPattern.push_back(Pattern(values.begin(),
                                       values.end(),
                                       targets.begin(),
                                       targets.end(),
                                       event->GetWeight()));
         testPattern.back().addInput(1.0);
      }
   }
   std::vector<double> weights;
   TMVA::DNN::Net net; // Network of the legacy (STANDARD) implementation.

   net.SetIpythonInteractive(fInteractive, &fExitFromTraining, &fIPyMaxIter, &fIPyCurrentIter);

   net.setInputSize(fNet.GetInputWidth() + 1);
   net.setOutputSize(fNet.GetOutputWidth() + 1);

   for (size_t i = 0; i < fNet.GetDepth(); i++) {
      EActivationFunction f = fNet.GetLayer(i).GetActivationFunction();
      EnumFunction g = EnumFunction::LINEAR;
      switch (f) {
         case EActivationFunction::kIdentity: g = EnumFunction::LINEAR;   break;
         case EActivationFunction::kRelu:     g = EnumFunction::RELU;     break;
         case EActivationFunction::kSigmoid:  g = EnumFunction::SIGMOID;  break;
         case EActivationFunction::kTanh:     g = EnumFunction::TANH;     break;
         case EActivationFunction::kSymmRelu: g = EnumFunction::SYMMRELU; break;
         case EActivationFunction::kSoftSign: g = EnumFunction::SOFTSIGN; break;
         case EActivationFunction::kGauss:    g = EnumFunction::GAUSS;    break;
      }
      if (i < fNet.GetDepth() - 1) {
         net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g));
      } else {
         ModeOutputValues h = ModeOutputValues::DIRECT;
         switch (fOutputFunction) {
            case EOutputFunction::kIdentity: h = ModeOutputValues::DIRECT;  break;
            case EOutputFunction::kSigmoid:  h = ModeOutputValues::SIGMOID; break;
            case EOutputFunction::kSoftmax:  h = ModeOutputValues::SOFTMAX; break;
         }
         net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g, h));
      }
   }

   switch (fNet.GetLossFunction()) {
      case ELossFunction::kMeanSquaredError:
         net.setErrorFunction(ModeErrorFunction::SUMOFSQUARES);
         break;
      case ELossFunction::kCrossEntropy:
         net.setErrorFunction(ModeErrorFunction::CROSSENTROPY);
         break;
      case ELossFunction::kSoftmaxCrossEntropy:
         net.setErrorFunction(ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE);
         break;
   }

   switch (fWeightInitialization) {
      case EInitialization::kGauss:
         net.initializeWeights(WeightInitializationStrategy::XAVIER,
                               std::back_inserter(weights));
         break;
      case EInitialization::kUniform:
         net.initializeWeights(WeightInitializationStrategy::XAVIERUNIFORM,
                               std::back_inserter(weights));
         break;
      default:
         net.initializeWeights(WeightInitializationStrategy::XAVIER,
                               std::back_inserter(weights));
         break;
   }
   size_t idxSetting = 0;
   for (auto s : fTrainingSettings) {

      EnumRegularization r = EnumRegularization::NONE;
      switch (s.regularization) {
         case ERegularization::kNone: r = EnumRegularization::NONE; break;
         case ERegularization::kL1:   r = EnumRegularization::L1;   break;
         case ERegularization::kL2:   r = EnumRegularization::L2;   break;
      }

      Settings * settings = new Settings(TString(), s.convergenceSteps, s.batchSize,
                                         s.testInterval, s.weightDecay, r,
                                         MinimizerType::fSteepest, s.learningRate,
                                         s.momentum, 1, s.multithreading);
      std::shared_ptr<Settings> ptrSettings(settings);
      ptrSettings->setMonitoring(0);
      Log() << kINFO
            << "Training with learning rate = " << ptrSettings->learningRate()
            << ", momentum = " << ptrSettings->momentum()
            << ", repetitions = " << ptrSettings->repetitions()
            << Endl;

      ptrSettings->setProgressLimits((idxSetting)*100.0/(fSettings.size()),
                                     (idxSetting+1)*100.0/(fSettings.size()));

      const std::vector<double>& dropConfig = ptrSettings->dropFractions();
      if (!dropConfig.empty()) {
         Log() << kINFO << "Drop configuration" << Endl
               << " drop repetitions = " << ptrSettings->dropRepetitions() << Endl;

         int idx = 0;
         for (auto f : dropConfig) {
            Log() << kINFO << " Layer " << idx << " = " << f << Endl;
            ++idx;
         }
         Log() << kINFO << Endl;
      }

      DNN::Steepest minimizer(ptrSettings->learningRate(),
                              ptrSettings->momentum(),
                              ptrSettings->repetitions());
      net.train(weights, trainPattern, testPattern, minimizer, *ptrSettings.get());
      Log() << kINFO << Endl;
      idxSetting++;
   }

   // Copy the trained weights (and first-layer biases) from the legacy network
   // back into fNet; deeper-layer biases are set to zero.
   size_t weightIndex = 0;
   for (size_t l = 0; l < fNet.GetDepth(); l++) {
      auto & layerWeights = fNet.GetLayer(l).GetWeights();
      for (Int_t j = 0; j < layerWeights.GetNcols(); j++) {
         for (Int_t i = 0; i < layerWeights.GetNrows(); i++) {
            layerWeights(i,j) = weights[weightIndex];
            weightIndex++;
         }
      }
      auto & layerBiases = fNet.GetLayer(l).GetBiases();
      if (l == 0) {
         for (Int_t i = 0; i < layerBiases.GetNrows(); i++) {
            layerBiases(i,0) = weights[weightIndex];
            weightIndex++;
         }
      } else {
         for (Int_t i = 0; i < layerBiases.GetNrows(); i++) {
            layerBiases(i,0) = 0.0;
         }
      }
   }
   if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
   ExitFromTraining();
}
void TMVA::MethodDNN::TrainGpu()
{
#ifdef DNNCUDA // Included only if DNNCUDA flag is set.
   Log() << kINFO << "Start of neural network training on GPU." << Endl << Endl;

   size_t nValidationSamples = GetNumValidationSamples();
   size_t nTrainingSamples = GetEventCollection(Types::kTraining).size() - nValidationSamples;
   size_t nTestSamples = nValidationSamples;

   Log() << kDEBUG << "Using " << nValidationSamples << " validation samples." << Endl;
   Log() << kDEBUG << "Using " << nTestSamples << " test samples." << Endl;
   size_t trainingPhase = 1;
   fNet.Initialize(fWeightInitialization);
   for (TTrainingSettings & settings : fTrainingSettings) {

      if (fInteractive) {
         fInteractive->ClearGraphs();
      }

      TNet<TCuda<>> net(settings.batchSize, fNet);
      net.SetWeightDecay(settings.weightDecay);
      net.SetRegularization(settings.regularization);

      // Convert the configured dropout probabilities into keep probabilities.
      std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
      for (auto & p : dropoutVector) {
         p = 1.0 - p;
      }
      net.SetDropoutProbabilities(dropoutVector);

      net.InitializeGradients();
      auto testNet = net.CreateClone(settings.batchSize);

      Log() << kINFO << "Training phase " << trainingPhase << " of "
            << fTrainingSettings.size() << ":" << Endl;
      trainingPhase++;

      using DataLoader_t = TDataLoader<TMVAInput_t, TCuda<>>;

      size_t nThreads = 1;
      const std::vector<Event *> &allData = GetEventCollection(Types::kTraining);
      const std::vector<Event *> trainingInputData =
         std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
      const std::vector<Event *> testInputData =
         std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());

      if (trainingInputData.size() != nTrainingSamples) {
         Log() << kFATAL << "Inconsistent training sample size" << Endl;
      }
      if (testInputData.size() != nTestSamples) {
         Log() << kFATAL << "Inconsistent test sample size" << Endl;
      }

      TMVAInput_t trainingTuple = std::tie(trainingInputData, DataInfo());
      TMVAInput_t testTuple = std::tie(testInputData, DataInfo());
      DataLoader_t trainingData(trainingTuple, nTrainingSamples,
                                net.GetBatchSize(), net.GetInputWidth(),
                                net.GetOutputWidth(), nThreads);
      DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
                            net.GetInputWidth(), net.GetOutputWidth(),
                            nThreads);

      DNN::TGradientDescent<TCuda<>> minimizer(settings.learningRate,
                                               settings.convergenceSteps,
                                               settings.testInterval);

      std::vector<TNet<TCuda<>>> nets{};
      std::vector<TBatch<TCuda<>>> batches{};
      nets.reserve(nThreads);
      for (size_t i = 0; i < nThreads; i++) {
         nets.push_back(net);
         for (size_t j = 0; j < net.GetDepth(); j++) {
            auto &masterLayer = net.GetLayer(j);
            auto &layer = nets.back().GetLayer(j);
            TCuda<>::Copy(layer.GetWeights(),
                          masterLayer.GetWeights());
            TCuda<>::Copy(layer.GetBiases(),
                          masterLayer.GetBiases());
         }
      }

      bool converged = false;
      size_t stepCount = 0;
      size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();

      std::chrono::time_point<std::chrono::system_clock> start, end;
      start = std::chrono::system_clock::now();

      // Header of the epoch summary table.
      Log() << std::setw(10) << "Epoch" << " | "
            << std::setw(12) << "Train Err."
            << std::setw(12) << "Test Err."
            << std::setw(12) << "GFLOP/s"
            << std::setw(12) << "Conv. Steps" << Endl;
      std::string separator(62, '-');
      Log() << separator << Endl;
      while (!converged) {
         stepCount++;

         // Perform one epoch of training steps.
         trainingData.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) {
               batches.reserve(nThreads);
               batches.push_back(trainingData.GetBatch());
            }
            if (settings.momentum > 0.0) {
               minimizer.StepMomentum(net, nets, batches, settings.momentum);
            } else {
               minimizer.Step(net, nets, batches);
            }
         }

         if ((stepCount % minimizer.GetTestInterval()) == 0) {

            // Compute the test error.
            Double_t testError = 0.0;
            for (auto batch : testData) {
               auto inputMatrix  = batch.GetInput();
               auto outputMatrix = batch.GetOutput();
               testError += testNet.Loss(inputMatrix, outputMatrix);
            }
            testError /= (Double_t) (nTestSamples / settings.batchSize);

            fTrainHistory.AddValue("testError", stepCount, testError);

            end = std::chrono::system_clock::now();

            // Compute the training error.
            Double_t trainingError = 0.0;
            for (auto batch : trainingData) {
               auto inputMatrix  = batch.GetInput();
               auto outputMatrix = batch.GetOutput();
               trainingError += net.Loss(inputMatrix, outputMatrix);
            }
            trainingError /= (Double_t) (nTrainingSamples / settings.batchSize);

            fTrainHistory.AddValue("trainingError", stepCount, trainingError);

            // Compute the throughput since the last test evaluation.
            std::chrono::duration<double> elapsed_seconds = end - start;
            double seconds = elapsed_seconds.count();
            double nFlops = (double) (settings.testInterval * batchesInEpoch);
            nFlops *= net.GetNFlops() * 1e-9;

            converged = minimizer.HasConverged(testError);
            start = std::chrono::system_clock::now();

            if (fInteractive) {
               fInteractive->AddPoint(stepCount, trainingError, testError);
               fIPyCurrentIter = 100.0 * minimizer.GetConvergenceCount()
                                 / minimizer.GetConvergenceSteps();
               if (fExitFromTraining) break;
            } else {
               Log() << std::setw(10) << stepCount << " | "
                     << std::setw(12) << trainingError
                     << std::setw(12) << testError
                     << std::setw(12) << nFlops / seconds
                     << std::setw(12) << minimizer.GetConvergenceCount() << Endl;
            }
         }
      }

      // Copy the trained weights back into fNet.
      for (size_t l = 0; l < net.GetDepth(); l++) {
         fNet.GetLayer(l).GetWeights() = (TMatrixT<Scalar_t>) net.GetLayer(l).GetWeights();
         fNet.GetLayer(l).GetBiases()  = (TMatrixT<Scalar_t>) net.GetLayer(l).GetBiases();
      }
   }

#else // DNNCUDA flag not set.
   Log() << kFATAL << "CUDA backend not enabled. Please make sure "
                      "you have CUDA installed and it was successfully "
                      "detected by CMAKE." << Endl;
#endif // DNNCUDA
}
void TMVA::MethodDNN::TrainCpu()
{
#ifdef DNNCPU // Included only if DNNCPU flag is set.
   Log() << kINFO << "Start of neural network training on CPU." << Endl << Endl;

   size_t nValidationSamples = GetNumValidationSamples();
   size_t nTrainingSamples = GetEventCollection(Types::kTraining).size() - nValidationSamples;
   size_t nTestSamples = nValidationSamples;

   Log() << kDEBUG << "Using " << nValidationSamples << " validation samples." << Endl;
   Log() << kDEBUG << "Using " << nTestSamples << " test samples." << Endl;
   fNet.Initialize(fWeightInitialization);

   size_t trainingPhase = 1;
   for (TTrainingSettings & settings : fTrainingSettings) {

      if (fInteractive) {
         fInteractive->ClearGraphs();
      }

      Log() << "Training phase " << trainingPhase << " of "
            << fTrainingSettings.size() << ":" << Endl;
      trainingPhase++;

      TNet<TCpu<>> net(settings.batchSize, fNet);
      net.SetWeightDecay(settings.weightDecay);
      net.SetRegularization(settings.regularization);

      // Convert the configured dropout probabilities into keep probabilities.
      std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
      for (auto & p : dropoutVector) {
         p = 1.0 - p;
      }
      net.SetDropoutProbabilities(dropoutVector);
      net.InitializeGradients();
      auto testNet = net.CreateClone(settings.batchSize);

      using DataLoader_t = TDataLoader<TMVAInput_t, TCpu<>>;

      const std::vector<Event *> &allData = GetEventCollection(Types::kTraining);
      const std::vector<Event *> trainingInputData =
         std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
      const std::vector<Event *> testInputData =
         std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());

      if (trainingInputData.size() != nTrainingSamples) {
         Log() << kFATAL << "Inconsistent training sample size" << Endl;
      }
      if (testInputData.size() != nTestSamples) {
         Log() << kFATAL << "Inconsistent test sample size" << Endl;
      }

      size_t nThreads = 1;
      TMVAInput_t trainingTuple = std::tie(trainingInputData, DataInfo());
      TMVAInput_t testTuple = std::tie(testInputData, DataInfo());
      DataLoader_t trainingData(trainingTuple, nTrainingSamples,
                                net.GetBatchSize(), net.GetInputWidth(),
                                net.GetOutputWidth(), nThreads);
      DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
                            net.GetInputWidth(), net.GetOutputWidth(),
                            nThreads);

      DNN::TGradientDescent<TCpu<>> minimizer(settings.learningRate,
                                              settings.convergenceSteps,
                                              settings.testInterval);

      std::vector<TNet<TCpu<>>> nets{};
      std::vector<TBatch<TCpu<>>> batches{};
      nets.reserve(nThreads);
      for (size_t i = 0; i < nThreads; i++) {
         nets.push_back(net);
         for (size_t j = 0; j < net.GetDepth(); j++) {
            auto &masterLayer = net.GetLayer(j);
            auto &layer = nets.back().GetLayer(j);
            TCpu<>::Copy(layer.GetWeights(),
                         masterLayer.GetWeights());
            TCpu<>::Copy(layer.GetBiases(),
                         masterLayer.GetBiases());
         }
      }

      bool converged = false;
      size_t stepCount = 0;
      size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();

      std::chrono::time_point<std::chrono::system_clock> start, end;
      start = std::chrono::system_clock::now();

      if (!fInteractive) {
         Log() << std::setw(10) << "Epoch" << " | "
               << std::setw(12) << "Train Err."
               << std::setw(12) << "Test Err."
               << std::setw(12) << "GFLOP/s"
               << std::setw(12) << "Conv. Steps" << Endl;
         std::string separator(62, '-');
         Log() << separator << Endl;
      }
      while (!converged) {
         stepCount++;

         // Perform one epoch of training steps.
         trainingData.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) {
               batches.reserve(nThreads);
               batches.push_back(trainingData.GetBatch());
            }
            if (settings.momentum > 0.0) {
               minimizer.StepMomentum(net, nets, batches, settings.momentum);
            } else {
               minimizer.Step(net, nets, batches);
            }
         }

         if ((stepCount % minimizer.GetTestInterval()) == 0) {

            // Compute the test error.
            Double_t testError = 0.0;
            for (auto batch : testData) {
               auto inputMatrix  = batch.GetInput();
               auto outputMatrix = batch.GetOutput();
               auto weightMatrix = batch.GetWeights();
               testError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
            }
            testError /= (Double_t) (nTestSamples / settings.batchSize);

            fTrainHistory.AddValue("testError", stepCount, testError);

            end = std::chrono::system_clock::now();

            // Compute the training error.
            Double_t trainingError = 0.0;
            for (auto batch : trainingData) {
               auto inputMatrix  = batch.GetInput();
               auto outputMatrix = batch.GetOutput();
               auto weightMatrix = batch.GetWeights();
               trainingError += net.Loss(inputMatrix, outputMatrix, weightMatrix);
            }
            trainingError /= (Double_t) (nTrainingSamples / settings.batchSize);

            fTrainHistory.AddValue("trainingError", stepCount, trainingError);

            if (fInteractive) {
               fInteractive->AddPoint(stepCount, trainingError, testError);
               fIPyCurrentIter = 100*(double)minimizer.GetConvergenceCount() /(double)settings.convergenceSteps;
               if (fExitFromTraining) break;
            }

            // Compute the throughput since the last test evaluation.
            std::chrono::duration<double> elapsed_seconds = end - start;
            double seconds = elapsed_seconds.count();
            double nFlops = (double) (settings.testInterval * batchesInEpoch);
            nFlops *= net.GetNFlops() * 1e-9;

            converged = minimizer.HasConverged(testError);
            start = std::chrono::system_clock::now();

            if (fInteractive) {
               fInteractive->AddPoint(stepCount, trainingError, testError);
               fIPyCurrentIter = 100.0 * minimizer.GetConvergenceCount()
                                 / minimizer.GetConvergenceSteps();
               if (fExitFromTraining) break;
            } else {
               Log() << std::setw(10) << stepCount << " | "
                     << std::setw(12) << trainingError
                     << std::setw(12) << testError
                     << std::setw(12) << nFlops / seconds
                     << std::setw(12) << minimizer.GetConvergenceCount() << Endl;
            }
         }
      }

      // Copy the trained weights back into fNet.
      for (size_t l = 0; l < net.GetDepth(); l++) {
         auto & layer = fNet.GetLayer(l);
         layer.GetWeights() = (TMatrixT<Scalar_t>) net.GetLayer(l).GetWeights();
         layer.GetBiases()  = (TMatrixT<Scalar_t>) net.GetLayer(l).GetBiases();
      }
   }

#else // DNNCPU flag not set.
   Log() << kFATAL << "Multi-core CPU backend not enabled. Please make sure "
                      "you have a BLAS implementation and it was successfully "
                      "detected by CMake, and that the imt CMake flag is set." << Endl;
#endif // DNNCPU
}
Double_t TMVA::MethodDNN::GetMvaValue(Double_t*, Double_t*)
{
   size_t nVariables = GetEvent()->GetNVariables();
   Matrix_t X(1, nVariables);
   Matrix_t YHat(1, 1);

   const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
   for (size_t i = 0; i < nVariables; i++) {
      X(0, i) = inputValues[i];
   }

   fNet.Prediction(YHat, X, fOutputFunction);
   return YHat(0, 0);
}
const std::vector<Float_t> & TMVA::MethodDNN::GetRegressionValues()
{
   size_t nVariables = GetEvent()->GetNVariables();
   Matrix_t X(1, nVariables);

   const Event *ev = GetEvent();
   const std::vector<Float_t>& inputValues = ev->GetValues();
   for (size_t i = 0; i < nVariables; i++) {
      X(0, i) = inputValues[i];
   }

   size_t nTargets = std::max(1u, ev->GetNTargets());
   Matrix_t YHat(1, nTargets);
   std::vector<Float_t> output(nTargets);
   auto net = fNet.CreateClone(1);
   net.Prediction(YHat, X, fOutputFunction);

   for (size_t i = 0; i < nTargets; i++)
      output[i] = YHat(0, i);

   if (fRegressionReturnVal == NULL) {
      fRegressionReturnVal = new std::vector<Float_t>();
   }
   fRegressionReturnVal->clear();

   Event * evT = new Event(*ev);
   for (size_t i = 0; i < nTargets; ++i) {
      evT->SetTarget(i, output[i]);
   }

   const Event* evT2 = GetTransformationHandler().InverseTransform(evT);
   for (size_t i = 0; i < nTargets; ++i) {
      fRegressionReturnVal->push_back(evT2->GetTarget(i));
   }

   return *fRegressionReturnVal;
}
const std::vector<Float_t> & TMVA::MethodDNN::GetMulticlassValues()
{
   size_t nVariables = GetEvent()->GetNVariables();
   Matrix_t X(1, nVariables);
   Matrix_t YHat(1, DataInfo().GetNClasses());
   if (fMulticlassReturnVal == NULL) {
      fMulticlassReturnVal = new std::vector<Float_t>(DataInfo().GetNClasses());
   }

   const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
   for (size_t i = 0; i < nVariables; i++) {
      X(0, i) = inputValues[i];
   }

   fNet.Prediction(YHat, X, fOutputFunction);
   for (size_t i = 0; i < (size_t) YHat.GetNcols(); i++) {
      (*fMulticlassReturnVal)[i] = YHat(0, i);
   }
   return *fMulticlassReturnVal;
}
void TMVA::MethodDNN::AddWeightsXMLTo(void* parent) const
{
   void* nn = gTools().xmlengine().NewChild(parent, 0, "Weights");
   Int_t inputWidth = fNet.GetInputWidth();
   Int_t depth      = fNet.GetDepth();
   char  lossFunction = static_cast<char>(fNet.GetLossFunction());
   gTools().xmlengine().NewAttr(nn, 0, "InputWidth",
                                gTools().StringFromInt(inputWidth));
   gTools().xmlengine().NewAttr(nn, 0, "Depth", gTools().StringFromInt(depth));
   gTools().xmlengine().NewAttr(nn, 0, "LossFunction", TString(lossFunction));
   gTools().xmlengine().NewAttr(nn, 0, "OutputFunction",
                                TString(static_cast<char>(fOutputFunction)));

   for (Int_t i = 0; i < depth; i++) {
      const auto& layer = fNet.GetLayer(i);
      auto layerxml = gTools().xmlengine().NewChild(nn, 0, "Layer");
      int activationFunction = static_cast<int>(layer.GetActivationFunction());
      gTools().xmlengine().NewAttr(layerxml, 0, "ActivationFunction",
                                   TString::Itoa(activationFunction, 10));
      WriteMatrixXML(layerxml, "Weights", layer.GetWeights());
      WriteMatrixXML(layerxml, "Biases",  layer.GetBiases());
   }
}
void TMVA::MethodDNN::ReadWeightsFromXML(void* rootXML)
{
   auto netXML = gTools().GetChild(rootXML, "Weights");

   fNet.SetBatchSize(1);

   size_t inputWidth, depth;
   gTools().ReadAttr(netXML, "InputWidth", inputWidth);
   gTools().ReadAttr(netXML, "Depth", depth);
   char lossFunctionChar;
   gTools().ReadAttr(netXML, "LossFunction", lossFunctionChar);
   char outputFunctionChar;
   gTools().ReadAttr(netXML, "OutputFunction", outputFunctionChar);

   fNet.SetInputWidth(inputWidth);
   fNet.SetLossFunction(static_cast<ELossFunction>(lossFunctionChar));
   fOutputFunction = static_cast<EOutputFunction>(outputFunctionChar);

   size_t previousWidth = inputWidth;
   auto layerXML = gTools().xmlengine().GetChild(netXML, "Layer");
   for (size_t i = 0; i < depth; i++) {
      // Read the activation function of this layer.
      EActivationFunction f;
      TString fString;
      gTools().ReadAttr(layerXML, "ActivationFunction", fString);
      f = static_cast<EActivationFunction>(fString.Atoi());

      // Read the layer width from the stored weight matrix.
      size_t width = 0;
      auto matrixXML = gTools().GetChild(layerXML, "Weights");
      gTools().ReadAttr(matrixXML, "rows", width);

      fNet.AddLayer(width, f);
      TMatrixT<Double_t> weights(width, previousWidth);
      TMatrixT<Double_t> biases(width, 1);
      ReadMatrixXML(layerXML, "Weights", weights);
      ReadMatrixXML(layerXML, "Biases",  biases);
      fNet.GetLayer(i).GetWeights() = weights;
      fNet.GetLayer(i).GetBiases()  = biases;

      layerXML = gTools().GetNextChild(layerXML);
      previousWidth = width;
   }
}
void TMVA::MethodDNN::ReadWeightsFromStream(std::istream &)
{
}

const TMVA::Ranking* TMVA::MethodDNN::CreateRanking()
{
   fRanking = new Ranking(GetName(), "Importance");
   for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
      fRanking->AddRank(Rank(GetInputLabel(ivar), 1.0));
   }
   return fRanking;
}
void TMVA::MethodDNN::MakeClassSpecific(std::ostream&, const TString&) const
{
}

void TMVA::MethodDNN::GetHelpMessage() const
{
   TString col    = gConfig().WriteOptionsReference() ? TString() : gTools().Color("bold");
   TString colres = gConfig().WriteOptionsReference() ? TString() : gTools().Color("reset");
   Log() << col << "--- Short description:" << colres << Endl;
   Log() << "The DNN neural network is a feedforward" << Endl;
   Log() << "multilayer perceptron implementation. The DNN has a user-" << Endl;
   Log() << "defined hidden layer architecture, where the number of input (output)" << Endl;
   Log() << "nodes is determined by the input variables (output classes, i.e., " << Endl;
   Log() << "signal and one background, regression or multiclass). " << Endl;
   Log() << col << "--- Performance optimisation:" << colres << Endl;

   const char* txt = "The DNN supports various options to improve performance in terms of training speed and \n \
reduction of overfitting: \n \
\n \
      - different training settings can be stacked, such that the initial training \n\
        is done with a large learning rate and a large dropout fraction, whilst \n \
        in a later stage learning rate and dropout can be reduced. \n \
      - dropout \n \
        [recommended: \n \
         initial training stage: 0.0 for the first layer, 0.5 for later layers. \n \
         later training stage: 0.1 or 0.0 for all layers \n \
         final training stage: 0.0] \n \
        Dropout is a technique where at each training cycle a fraction of randomly chosen \n \
        nodes is disabled. This reduces co-adaptation of weights and thus reduces overfitting. \n \
      - L1 and L2 regularization are available \n \
      - Minibatch size \n \
        [recommended 10 - 150] \n \
        Arbitrary mini-batch sizes can be chosen. \n \
      - Multithreading \n \
        [recommended: True] \n \
        Multithreading can be turned on. The minibatches are distributed to the available \n \
        cores. The algorithm is lock-free (\"Hogwild!\"-style) for each cycle. \n \
\n \
      \"Layout\": \n \
         - example: \"TANH|(N+30)*2,TANH|(N+30),LINEAR\" \n \
         - meaning: \n \
            . two hidden layers (separated by \",\") \n \
            . the activation function is TANH (other options: RELU, SOFTSIGN, LINEAR) \n \
            . the activation function for the output layer is LINEAR \n \
            . the first hidden layer has (N+30)*2 nodes where N is the number of input neurons \n \
            . the second hidden layer has N+30 nodes, where N is the number of input neurons \n \
            . the number of nodes in the output layer is determined by the number of output classes \n \
              (or regression targets) and can therefore not be chosen freely. \n \
\n \
      \"ErrorStrategy\": \n \
         - SUMOFSQUARES \n \
            The error of the neural net is determined by a sum-of-squares error function. \n \
            For regression, this is the only possible choice. \n \
         - CROSSENTROPY \n \
            The error of the neural net is determined by a cross entropy function. The \n \
            output values are automatically (internally) transformed into probabilities \n \
            using a sigmoid function. \n \
            For signal/background classification this is the default choice. \n \
            For multiclass using cross entropy, more than one (or no) output class \n \
            can be true at the same time (e.g. Event 0: A and B are true, Event 1: \n \
            A and C are true, Event 2: C is true, ...). \n \
         - MUTUALEXCLUSIVE \n \
            In multiclass settings, exactly one of the output classes can be true (e.g. either A or B or C). \n \
\n \
      \"WeightInitialization\": \n \
         - XAVIER \n \
            \"Xavier Glorot & Yoshua Bengio\"-style of initializing the weights. The weights are chosen randomly \n \
            such that the variance of the values of the nodes is preserved for each layer. \n \
         - XAVIERUNIFORM \n \
            The same as XAVIER, but with uniformly distributed weights instead of Gaussian weights. \n \
         - Random values scaled by the layer size \n \
\n \
      \"TrainingStrategy\": \n \
         - example: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Regularization=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Regularization=L2,DropConfig=0.0,DropRepetitions=5\" \n \
         - explanation: two stacked training settings separated by \"|\" \n \
            . first training setting: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Regularization=L2,DropConfig=0.0,DropRepetitions=5\" \n \
            . second training setting: \"LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Regularization=L2,DropConfig=0.0,DropRepetitions=5\" \n \
            . LearningRate : \n \
               - recommended for classification: 0.1 initially, 1e-4 later \n \
               - recommended for regression: 1e-4 and less \n \
            . Momentum : \n \
               preserve a fraction of the momentum for the next training batch [fraction = 0.0 - 1.0] \n \
            . Repetitions : \n \
               train \"Repetitions\" repetitions with the same minibatch before switching to the next one \n \
            . ConvergenceSteps : \n \
               Assume that convergence is reached after \"ConvergenceSteps\" cycles in which no improvement \n \
               of the error on the test samples has been found. (Note that the test samples are only \n \
               evaluated every \"TestRepetitions\" cycles, so convergence is only checked then.) \n \
            . BatchSize : \n \
               Size of the mini-batches. \n \
            . TestRepetitions : \n \
               Perform testing of the neural net on the test samples every \"TestRepetitions\" cycles \n \
            . WeightDecay : \n \
               If \"Regularization\" is set to L1 or L2, \"WeightDecay\" provides the regularization factor \n \
            . Regularization : \n \
               NONE, L1 (|w|) or L2 (w^2) \n \
            . DropConfig : \n \
               Drop a fraction of arbitrary nodes of each of the layers according to the values given \n \
               in the DropConfig. \n \
               [example: DropConfig=0.0+0.5+0.3 \n \
                meaning: drop no nodes in layer 0 (input layer), half of the nodes in layer 1 and 30% of the nodes \n \
                in layer 2 \n \
                recommended: leave all the nodes turned on for the input layer (layer 0); \n \
                turn off half of the nodes in later layers for the initial training; leave all nodes \n \
                turned on (0.0) in later training stages] \n \
            . DropRepetitions : \n \
               Every \"DropRepetitions\" cycles the configuration of which nodes are dropped is changed \n \
               [recommended : 1] \n \
            . Multithreading : \n \
               turn on multithreading [recommended: True] \n \
";

   Log() << txt << Endl;
}