11 #ifndef TMVA_DNN_MINIMIZERS
12 #define TMVA_DNN_MINIMIZERS
14 #include "DataLoader.h"
15 #include "Functions.h"
53 template<
typename Architecture_t>
/// Stochastic batch gradient-descent minimizer for TMVA DNN networks.
/// Architecture_t supplies the scalar/matrix types and the low-level
/// numerical kernels (Copy, ScaleAdd, ...) used to apply weight updates.
54 class TGradientDescent
57 using Scalar_t =
typename Architecture_t::Scalar_t;
58 using Matrix_t =
typename Architecture_t::Matrix_t;
// Number of steps without sufficient test-error improvement after which
// training is considered converged (see HasConverged()).
63 size_t fConvergenceSteps;
// Running count of steps without improvement; reset to 0 whenever the
// test error drops below 0.999 * fMinimumError.
65 size_t fConvergenceCount;
// Training error recorded by the last StepReducedWeightsLoss() call.
68 Scalar_t fTrainingError;
// Step size applied to the gradients during weight updates.
70 Scalar_t fLearningRate;
// Smallest test error observed so far; initialized to +infinity.
71 Scalar_t fMinimumError;
76 TGradientDescent(Scalar_t learningRate,
77 size_t convergenceSteps,
// NOTE(review): the declaration enclosing the two statements below is not
// visible in this excerpt (original lines 78-82 are missing); they appear
// to (re)initialize the convergence-tracking state.
83 fMinimumError = std::numeric_limits<Scalar_t>::infinity();
84 fConvergenceCount = 0;
// Train net on the training data until convergence, evaluating the test
// error every fTestInterval epochs; nThreads worker nets consume separate
// batches in parallel. Returns the minimum test error reached.
90 template <
typename Data_t,
typename Net_t>
91 Scalar_t Train(
const Data_t & TrainingDataIn,
size_t nTrainingSamples,
92 const Data_t & TestDataIn,
size_t nTestSamples,
93 Net_t & net,
size_t nThreads = 1);
// Same as Train() but uses momentum-smoothed updates (StepMomentum());
// falls back to plain Step() when momentum == 0.
96 template <
typename Data_t,
typename Net_t>
97 Scalar_t TrainMomentum(
const Data_t & TrainingDataIn,
size_t nTrainingSamples,
98 const Data_t & TestDataIn,
size_t nTestSamples,
99 Net_t & net, Scalar_t momentum,
size_t nThreads = 1);
// Perform one gradient-descent step on a single batch.
106 template <
typename Net_t>
107 void Step(Net_t &net, Matrix_t &input,
const Matrix_t &output,
const Matrix_t &weights);
// Same as Step() but also evaluates and returns the loss.
111 template <
typename Net_t>
112 Scalar_t StepLoss(Net_t &net, Matrix_t &input,
const Matrix_t &output,
const Matrix_t &weights);
// Parallel step: run forward/backward on the worker nets, accumulate
// their gradients into master, then copy master's weights back out.
120 template <
typename Net_t>
121 void Step(Net_t &master,
122 std::vector<Net_t> &nets,
123 std::vector<TBatch<Architecture_t>> &batches);
// Parallel step with momentum-smoothed gradient accumulation.
126 template <
typename Net_t>
127 void StepMomentum(Net_t &master,
128 std::vector<Net_t> &nets,
129 std::vector<TBatch<Architecture_t>> &batches,
// Parallel step with Nesterov-style updates: workers evaluate gradients
// at the look-ahead position (master weights + gradient step).
131 template <
typename Net_t>
135 void StepNesterov(Net_t &master,
136 std::vector<Net_t> &nets,
137 std::vector<TBatch<Architecture_t>> &batches,
// Single-batch step without an event-weight matrix.
143 template <
typename Net_t>
144 void StepReducedWeights(Net_t &net, Matrix_t &input,
const Matrix_t &output);
// Single-batch step that records the training error in fTrainingError.
148 template <
typename Net_t>
149 Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
const Matrix_t &output,
const Matrix_t &weights);
// Record testError and report whether fConvergenceSteps evaluations have
// elapsed without sufficient improvement.
157 bool HasConverged(Scalar_t testError);
159 size_t GetConvergenceCount()
const {
return fConvergenceCount;}
160 size_t GetConvergenceSteps()
const {
return fConvergenceSteps;}
161 Scalar_t GetTrainingError()
const {
return fTrainingError;}
162 Scalar_t GetTestError()
const {
return fTestError;}
163 size_t GetTestInterval()
const {
return fTestInterval;}
165 void SetConvergenceSteps(
size_t steps) {fConvergenceSteps = steps;}
166 void SetTestInterval(
size_t interval) {fTestInterval = interval;}
167 void SetLearningRate(Scalar_t rate) {fLearningRate = rate;}
// NOTE(review): parameter type Scalar_t looks inconsistent -- the
// constructors initialize fBatchSize together with the size_t members,
// so size_t would be expected here. Confirm before changing the API.
168 void SetBatchSize(Scalar_t rate) {fBatchSize = rate;}
174 template <
typename Architecture_t>
/// Default constructor: zero-initializes all hyperparameters and sets the
/// best-seen test error to +infinity so the first evaluation improves it.
/// NOTE(review): fTrainingError/fTestError are not in the visible part of
/// the initializer list -- confirm they are initialized before first use.
175 TGradientDescent<Architecture_t>::TGradientDescent()
176 : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
177 fMinimumError(std::numeric_limits<Scalar_t>::infinity())
183 template <
typename Architecture_t>
/// Constructor taking the learning rate, the number of non-improving test
/// evaluations tolerated before convergence, and the number of training
/// epochs between test-error evaluations.
184 TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
size_t convergenceSteps,
size_t testInterval)
// fMinimumError starts at +infinity so any finite test error improves it.
185 : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
186 fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
192 template<
typename Architecture_t>
193 template <
typename Data_t,
typename Net_t>
/// Train `net` until HasConverged() reports convergence. Each outer pass
/// runs fTestInterval epochs, then evaluates the test error on a clone of
/// the net sized to the full test set. Returns the minimum test error.
194 auto TGradientDescent<Architecture_t>::Train(
const Data_t & trainingData,
195 size_t nTrainingSamples,
196 const Data_t & testData,
// Loaders stream batches from the training and test data.
205 TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
208 net.GetOutputWidth(), nThreads);
// Clone sized to nTestSamples so one batch covers the whole test set.
209 auto testNet = net.CreateClone(nTestSamples);
210 TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
211 testNet.GetBatchSize(),
212 testNet.GetInputWidth(),
213 net.GetOutputWidth());
// One worker net per thread; each starts from the master's weights.
214 std::vector<Net_t> nets{};
215 nets.reserve(nThreads)
216 for (
size_t i = 0; i < nThreads; i++) {
218 for (
size_t j = 0; j < net.GetDepth(); j++)
220 auto &masterLayer = net.GetLayer(j);
221 auto &layer = nets.back().GetLayer(j);
222 Architecture_t::Copy(layer.GetWeights(),
223 masterLayer.GetWeights());
224 Architecture_t::Copy(layer.GetBiases(),
225 masterLayer.GetBiases());
229 size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
230 std::vector<TBatch<Architecture_t>> batches{};
231 batches.reserve(nThreads);
// Run fTestInterval epochs between test-error evaluations.
234 for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
235 trainLoader.Shuffle();
236 for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
// NOTE(review): the line clearing `batches` between iterations (original
// line 237) is not visible in this excerpt -- confirm it exists.
238 for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
239 Step(net, nets, batches);
// Evaluate the test error on the single full-test-set batch.
243 auto b = *testLoader.begin();
244 auto inputMatrix = b.GetInput();
245 auto outputMatrix = b.GetOutput();
246 auto weightMatrix = b.GetWeights();
247 fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
249 }
// Tail of a do { ... } while loop: repeat until convergence.
while (!HasConverged());
251 return fMinimumError;
255 template<
typename Architecture_t>
256 template <
typename Data_t,
typename Net_t>
/// Momentum variant of Train(): uses StepMomentum() when momentum != 0 and
/// plain Step() otherwise. The test error is averaged over batches rather
/// than computed on a single full-test-set batch. Returns the minimum
/// test error observed.
257 auto TGradientDescent<Architecture_t>::TrainMomentum(
const Data_t & trainingData,
258 size_t nTrainingSamples,
259 const Data_t & testData,
269 TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
272 net.GetOutputWidth(), nThreads);
// Unlike Train(), the test clone keeps the training batch size, so the
// test loss is accumulated batch by batch below.
273 auto testNet = net.CreateClone(net.GetBatchSize());
274 TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
275 testNet.GetBatchSize(),
276 testNet.GetInputWidth(),
277 net.GetOutputWidth());
279 net.InitializeGradients();
// One worker net per thread; each starts from the master's weights.
280 std::vector<Net_t> nets{};
281 nets.reserve(nThreads);
282 for (
size_t i = 0; i < nThreads; i++) {
284 for (
size_t j = 0; j < net.GetDepth(); j++)
286 auto &masterLayer = net.GetLayer(j);
287 auto &layer = nets.back().GetLayer(j);
288 Architecture_t::Copy(layer.GetWeights(),
289 masterLayer.GetWeights());
290 Architecture_t::Copy(layer.GetBiases(),
291 masterLayer.GetBiases());
295 size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
296 std::vector<TBatch<Architecture_t>> batches{};
297 batches.reserve(nThreads);
300 for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
301 trainLoader.Shuffle();
302 for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
304 for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
305 if (momentum != 0.0) {
306 StepMomentum(net, nets, batches, momentum);
// NOTE(review): the `} else {` between these calls (original line 307)
// is not visible in this excerpt.
308 Step(net, nets, batches);
// Average the test loss over all test batches.
314 for (
size_t i = 0; i < batchesInEpoch; i++) {
315 auto b = testLoader.GetBatch();
316 auto inputMatrix = b.GetInput();
317 auto outputMatrix = b.GetOutput();
318 auto weightMatrix = b.GetWeights();
319 fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
321 fTestError /= (Double_t)batchesInEpoch;
322 }
// Tail of a do { ... } while loop: repeat until convergence.
while (!HasConverged());
323 return fMinimumError;
327 template <
typename Architecture_t>
328 template <
typename Net_t>
/// One gradient-descent step on a single batch: forward pass (training
/// mode), backward pass, then in-place update of every layer's weights
/// and biases via ScaleAdd.
329 void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input,
const Matrix_t &output,
330 const Matrix_t &weights)
332 net.Forward(input,
true);
333 net.Backward(input, output, weights);
335 for (
size_t i = 0; i < net.GetDepth(); i++)
337 auto &layer = net.GetLayer(i);
// NOTE(review): the scale factors for these ScaleAdd calls (original
// lines 340/343, presumably -fLearningRate) are not visible here.
338 Architecture_t::ScaleAdd(layer.GetWeights(),
339 layer.GetWeightGradients(),
341 Architecture_t::ScaleAdd(layer.GetBiases(),
342 layer.GetBiasGradients(),
348 template <
typename Architecture_t>
349 template <
typename Net_t>
/// Like Step() but evaluates the loss first (which also performs the
/// forward pass) and returns it.
350 auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input,
const Matrix_t &output,
351 const Matrix_t &weights) -> Scalar_t
353 Scalar_t loss = net.Loss(input, output, weights);
// NOTE(review): Backward is called without `weights` here, while Step()
// passes them -- confirm whether the event weights should be forwarded.
354 net.Backward(input, output);
356 for (
size_t i = 0; i < net.GetDepth(); i++)
358 auto &layer = net.GetLayer(i);
// NOTE(review): the ScaleAdd scale factors (original lines 361/364) are
// not visible in this excerpt.
359 Architecture_t::ScaleAdd(layer.GetWeights(),
360 layer.GetWeightGradients(),
362 Architecture_t::ScaleAdd(layer.GetBiases(),
363 layer.GetBiasGradients(),
370 template<
typename Architecture_t>
371 template <
typename Net_t>
/// Parallel step: each worker net processes its own batch layer-by-layer,
/// loss gradients are evaluated, the backward pass runs, and finally each
/// worker's gradients are accumulated into the master whose updated
/// weights are copied back to the workers.
372 void inline TGradientDescent<Architecture_t>::Step(
374 std::vector<Net_t> & nets,
375 std::vector<TBatch<Architecture_t>> & batches)
// Placeholder matrix for the (unused) activation-gradient argument of the
// first layer's backward pass.
377 typename Architecture_t::Matrix_t dummy(0,0);
378 size_t depth = master.GetDepth();
// Forward: first layer consumes the batch input...
381 for (
size_t j = 0; j < nets.size(); j++) {
382 nets[j].GetLayer(0).Forward(batches[j].GetInput(),
true);
// ...remaining layers consume the previous layer's output.
385 for (
size_t i = 1; i < depth; i++)
387 for (
size_t j = 0; j < nets.size(); j++) {
388 nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(),
true);
// Seed the backward pass with the loss gradients at the last layer.
392 for (
size_t j = 0; j < nets.size(); j++) {
393 evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
394 batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
395 batches[j].GetWeights());
// Backward through the hidden layers...
398 for (
size_t i = depth - 1; i > 0; i--)
400 for (
size_t j = 0; j < nets.size(); j++) {
401 nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
402 nets[j].GetLayer(i-1).GetOutput(),
403 nets[j].GetRegularization(),
404 nets[j].GetWeightDecay());
// ...and the first layer, which takes the batch input directly.
407 for (
size_t j = 0; j < nets.size(); j++) {
408 nets[j].GetLayer(0).Backward(dummy,
409 batches[j].GetInput(),
410 nets[j].GetRegularization(),
411 nets[j].GetWeightDecay());
// Apply each worker's gradients to the master and sync the workers.
// NOTE(review): the ScaleAdd scale factors (original lines 421/426) are
// not visible in this excerpt.
414 for (
size_t j = 0; j < nets.size(); j++) {
415 for (
size_t i = 0; i < depth; i++)
417 auto &masterLayer = master.GetLayer(i);
418 auto &layer = nets[j].GetLayer(i);
419 Architecture_t::ScaleAdd(masterLayer.GetWeights(),
420 layer.GetWeightGradients(),
422 Architecture_t::Copy(layer.GetWeights(),
423 masterLayer.GetWeights());
424 Architecture_t::ScaleAdd(masterLayer.GetBiases(),
425 layer.GetBiasGradients(),
427 Architecture_t::Copy(layer.GetBiases(),
428 masterLayer.GetBiases());
434 template<
typename Architecture_t>
435 template <
typename Net_t>
/// Parallel momentum step: forward/backward as in the parallel Step(),
/// but worker gradients are folded into the master's gradient matrices
/// (scaled by -fLearningRate / momentum) which act as the momentum
/// accumulator, before being applied to the master weights.
436 void inline TGradientDescent<Architecture_t>::StepMomentum(
438 std::vector<Net_t> & nets,
439 std::vector<TBatch<Architecture_t>> & batches,
// Placeholder for the first layer's (unused) activation-gradient input.
442 typename Architecture_t::Matrix_t dummy(0,0);
443 size_t depth = master.GetDepth();
// Forward pass across all worker nets.
446 for (
size_t j = 0; j < nets.size(); j++) {
447 nets[j].GetLayer(0).Forward(batches[j].GetInput(),
true);
450 for (
size_t i = 1; i < depth; i++)
452 for (
size_t j = 0; j < nets.size(); j++) {
453 nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(),
true);
// Loss gradients at the output layer.
457 for (
size_t j = 0; j < nets.size(); j++) {
458 evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
459 batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
460 batches[j].GetWeights());
// Backward pass; each worker's gradients are accumulated into the
// master's gradient matrices scaled by -fLearningRate / momentum.
463 for (
size_t i = depth - 1; i > 0; i--)
465 for (
size_t j = 0; j < nets.size(); j++) {
466 nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
467 nets[j].GetLayer(i-1).GetOutput(),
468 nets[j].GetRegularization(),
469 nets[j].GetWeightDecay());
470 Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
471 nets[j].GetLayer(i).GetWeightGradients(),
472 - fLearningRate / momentum);
473 Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
474 nets[j].GetLayer(i).GetBiasGradients(),
475 - fLearningRate / momentum);
// NOTE(review): the scale factors for the self-ScaleAdd calls below
// (original lines 479/482, presumably momentum - 1.0) are not visible.
477 Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
478 master.GetLayer(i).GetWeightGradients(),
480 Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
481 master.GetLayer(i).GetBiasGradients(),
// First layer: backward against the batch input, same accumulation.
484 for (
size_t j = 0; j < nets.size(); j++) {
485 nets[j].GetLayer(0).Backward(dummy,
486 batches[j].GetInput(),
487 nets[j].GetRegularization(),
488 nets[j].GetWeightDecay());
489 Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
490 nets[j].GetLayer(0).GetWeightGradients(),
491 - fLearningRate / momentum);
492 Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
493 nets[j].GetLayer(0).GetBiasGradients(),
494 - fLearningRate / momentum);
497 Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
498 master.GetLayer(0).GetWeightGradients(),
500 Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
501 master.GetLayer(0).GetBiasGradients(),
// Apply the accumulated (momentum) gradients to the master weights and
// copy the updated weights back into every worker net.
504 for (
size_t i = 0; i < depth; i++)
506 auto &masterLayer = master.GetLayer(i);
507 Architecture_t::ScaleAdd(masterLayer.GetWeights(),
508 masterLayer.GetWeightGradients(),
510 Architecture_t::ScaleAdd(masterLayer.GetBiases(),
511 masterLayer.GetBiasGradients(),
513 for (
size_t j = 0; j < nets.size(); j++) {
514 auto &layer = nets[j].GetLayer(i);
515 Architecture_t::Copy(layer.GetWeights(),
516 masterLayer.GetWeights());
517 Architecture_t::Copy(layer.GetBiases(),
518 masterLayer.GetBiases());
524 template<
typename Architecture_t>
525 template <
typename Net_t>
/// Parallel Nesterov step: like StepMomentum(), but before accumulating
/// the worker gradients each worker is moved to the look-ahead position
/// (master weights plus the scaled momentum gradients), so the gradients
/// are evaluated ahead of the current master position.
526 void inline TGradientDescent<Architecture_t>::StepNesterov(
528 std::vector<Net_t> & nets,
529 std::vector<TBatch<Architecture_t>> & batches,
// Placeholder for the first layer's (unused) activation-gradient input.
532 typename Architecture_t::Matrix_t dummy(0,0);
533 size_t depth = master.GetDepth();
// Forward pass across all worker nets.
536 for (
size_t j = 0; j < nets.size(); j++) {
537 nets[j].GetLayer(0).Forward(batches[j].GetInput(),
true);
540 for (
size_t i = 1; i < depth; i++)
542 for (
size_t j = 0; j < nets.size(); j++) {
543 nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(),
true);
// Loss gradients at the output layer.
548 for (
size_t j = 0; j < nets.size(); j++) {
549 evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
550 batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
551 batches[j].GetWeights());
// Backward pass through hidden layers...
555 for (
size_t i = depth - 1; i > 0; i--)
557 for (
size_t j = 0; j < nets.size(); j++) {
558 nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
559 nets[j].GetLayer(i-1).GetOutput(),
560 nets[j].GetRegularization(),
561 nets[j].GetWeightDecay());
// ...and the first layer against the batch input.
565 for (
size_t j = 0; j < nets.size(); j++) {
566 nets[j].GetLayer(0).Backward(dummy,
567 batches[j].GetInput(),
568 nets[j].GetRegularization(),
569 nets[j].GetWeightDecay());
// Per layer: move workers to the look-ahead position, fold their
// gradients into the master accumulator, then update master weights.
572 for (
size_t i = 0; i < depth; i++)
574 auto &masterLayer = master.GetLayer(i);
575 for (
size_t j = 0; j < nets.size(); j++) {
576 auto &layer = nets[j].GetLayer(i);
577 Architecture_t::Copy(layer.GetWeights(),
578 masterLayer.GetWeights());
579 Architecture_t::Copy(layer.GetBiases(),
580 masterLayer.GetBiases());
// NOTE(review): the scale factors for several ScaleAdd calls below
// (original lines 583/586/599/602/605/608) are not visible here.
581 Architecture_t::ScaleAdd(layer.GetWeights(),
582 masterLayer.GetWeightGradients(),
584 Architecture_t::ScaleAdd(layer.GetBiases(),
585 masterLayer.GetBiasGradients(),
588 for (
size_t j = 0; j < nets.size(); j++) {
589 auto &layer = nets[j].GetLayer(i);
590 Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
591 layer.GetWeightGradients(),
592 - fLearningRate / momentum);
593 Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
594 layer.GetBiasGradients(),
595 - fLearningRate / momentum);
597 Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
598 masterLayer.GetWeightGradients(),
600 Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
601 masterLayer.GetBiasGradients(),
603 Architecture_t::ScaleAdd(masterLayer.GetWeights(),
604 masterLayer.GetWeightGradients(),
606 Architecture_t::ScaleAdd(masterLayer.GetBiases(),
607 masterLayer.GetBiasGradients(),
613 template<
typename Architecture_t>
614 template <
typename Net_t>
/// Single-batch step without an event-weight matrix: forward pass in
/// training mode, backward pass, then ScaleAdd weight/bias updates.
615 void inline TGradientDescent<Architecture_t>::StepReducedWeights(
618 const Matrix_t &output)
620 net.Forward(input,
true);
// Backward without event weights (matches the two-argument signature).
621 net.Backward(input, output);
623 for (
size_t i = 0; i < net.GetDepth(); i++)
625 auto &layer = net.GetLayer(i);
// NOTE(review): the ScaleAdd scale factors (original lines 628/632) are
// not visible in this excerpt.
626 Architecture_t::ScaleAdd(layer.GetWeights(),
627 layer.GetWeightGradients(),
630 Architecture_t::ScaleAdd(layer.GetBiases(),
631 layer.GetBiasGradients(),
638 template <
typename Architecture_t>
639 template <
typename Net_t>
/// Like StepLoss() but evaluates the loss without event weights, records
/// it in fTrainingError, and returns it.
640 auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
641 const Matrix_t &output,
const Matrix_t &weights)
// NOTE(review): Loss() is called without `weights` while Backward() gets
// them -- confirm this asymmetry is intentional.
644 Scalar_t loss = net.Loss(input, output);
645 fTrainingError = loss;
646 net.Backward(input, output, weights);
648 for (
size_t i = 0; i < net.GetDepth(); i++)
650 auto &layer = net.GetLayer(i);
// NOTE(review): the ScaleAdd scale factors (original lines 653/657) are
// not visible in this excerpt.
651 Architecture_t::ScaleAdd(layer.GetWeights(),
652 layer.GetWeightGradients(),
655 Architecture_t::ScaleAdd(layer.GetBiases(),
656 layer.GetBiasGradients(),
664 template<
typename Architecture_t>
/// Convergence check using the already-recorded fTestError: an improvement
/// of at least 0.1% over fMinimumError resets the convergence counter;
/// convergence is reached once fConvergenceCount >= fConvergenceSteps.
/// NOTE(review): the else-branch incrementing fConvergenceCount (original
/// lines 670-673) is not visible in this excerpt.
665 bool inline TGradientDescent<Architecture_t>::HasConverged()
667 if (fTestError < fMinimumError * 0.999) {
668 fConvergenceCount = 0;
669 fMinimumError = fTestError;
674 return (fConvergenceCount >= fConvergenceSteps);
678 template<
typename Architecture_t>
/// Convergence check for an externally computed test error: records it in
/// fTestError, resets the counter on a >=0.1% improvement, and otherwise
/// advances the counter by fTestInterval (the counter is measured in
/// training steps, and evaluations happen every fTestInterval steps).
/// NOTE(review): the `} else {` between the branches (original line 685)
/// is not visible in this excerpt.
679 bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
681 fTestError = testError;
682 if (fTestError < fMinimumError * 0.999) {
683 fConvergenceCount = 0;
684 fMinimumError = fTestError;
686 fConvergenceCount += fTestInterval;
688 return (fConvergenceCount >= fConvergenceSteps);