Adam.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis
 * Package: TMVA
 * Class  : TAdam
 * Web    : http://tmva.sourceforge.net
 *
 * Description:
 *      Adam Optimizer Class
 *
 * Authors (alphabetical):
 *      Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland
 *
 * Copyright (c) 2005-2018:
 *      CERN, Switzerland
 *      U. of Victoria, Canada
 *      MPI-K Heidelberg, Germany
 *      U. of Bonn, Germany
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted according to the terms listed in LICENSE
 * (http://tmva.sourceforge.net/LICENSE)
 **********************************************************************************/

#ifndef TMVA_DNN_ADAM
#define TMVA_DNN_ADAM

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TAdam
 * Adam (Adaptive Moment Estimation) optimizer.
 *
 * For every weight and bias of the deep net this optimizer keeps an exponentially
 * decaying average of the past gradients (first moment) and of the past squared
 * gradients (second moment), and uses them together with a bias-corrected step
 * size to update the parameters. See "Adam: A Method for Stochastic Optimization",
 * D. Kingma and J. Ba, https://arxiv.org/abs/1412.6980.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdam : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fBeta1;   ///< The Beta1 constant used by the optimizer.
   Scalar_t fBeta2;   ///< The Beta2 constant used by the optimizer.
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fFirstMomentWeights;  ///< The decaying average of the first moment of the past
                                                            ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fFirstMomentBiases;   ///< The decaying average of the first moment of the past
                                                            ///< bias gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fSecondMomentWeights; ///< The decaying average of the second moment of the past
                                                            ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fSecondMomentBiases;  ///< The decaying average of the second moment of the past
                                                            ///< bias gradients associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdam(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t beta1 = 0.9, Scalar_t beta2 = 0.999,
         Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TAdam() = default;

   /*! Getters */
   Scalar_t GetBeta1() const { return fBeta1; }
   Scalar_t GetBeta2() const { return fBeta2; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetFirstMomentWeights() { return fFirstMomentWeights; }
   std::vector<Matrix_t> &GetFirstMomentWeightsAt(size_t i) { return fFirstMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetFirstMomentBiases() { return fFirstMomentBiases; }
   std::vector<Matrix_t> &GetFirstMomentBiasesAt(size_t i) { return fFirstMomentBiases[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentWeights() { return fSecondMomentWeights; }
   std::vector<Matrix_t> &GetSecondMomentWeightsAt(size_t i) { return fSecondMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentBiases() { return fSecondMomentBiases; }
   std::vector<Matrix_t> &GetSecondMomentBiasesAt(size_t i) { return fSecondMomentBiases[i]; }
};
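
// A minimal usage sketch (not part of this header; the concrete architecture and the
// surrounding training loop are assumptions): with a reference architecture such as
// TMVA::DNN::TCpu<Double_t> and an already configured TDeepNet, the optimizer is
// constructed once and then stepped after every backward pass, roughly as follows:
//
//    using Architecture_t = TMVA::DNN::TCpu<Double_t>;
//    TMVA::DNN::TDeepNet<Architecture_t> net /* ... layers added and initialized ... */;
//    TMVA::DNN::TAdam<Architecture_t> adam(net, /*learningRate=*/0.001);
//    // per batch: forward and backward pass to fill the gradients, then
//    adam.Step(); // from the VOptimizer base: advances the global step and updates all layers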

//
//
// The Adam Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdam<Architecture_t, Layer_t, DeepNet_t>::TAdam(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t beta1,
                                                 Scalar_t beta2, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fBeta1(beta1), fBeta2(beta2),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fFirstMomentWeights.resize(layersNSlices);
   fFirstMomentBiases.resize(layersNSlices);
   fSecondMomentWeights.resize(layersNSlices);
   fSecondMomentBiases.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {

      // create the moment accumulators with the same shapes as the layer weights
      Architecture_t::CreateWeightTensors(fFirstMomentWeights[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fSecondMomentWeights[i], layers[i]->GetWeights());

      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentWeights[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentWeights[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      // create the moment accumulators with the same shapes as the layer biases
      Architecture_t::CreateWeightTensors(fFirstMomentBiases[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fSecondMomentBiases[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentBiases[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentBiases[i][j], EInitialization::kZero);
      }
   }
}
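
// Note: all first and second moment estimates start at zero, so the early estimates are
// biased towards zero; this is what the bias-corrected step size alpha computed in
// UpdateWeights and UpdateBiases compensates for.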

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                              const std::vector<Matrix_t> &weightGradients) -> void
{
   // update of the weights using the Adam algorithm
   // we use the formulation defined before section 2.1 in the original paper
   // "Adam: A Method for Stochastic Optimization", D. Kingma and J. Ba, https://arxiv.org/abs/1412.6980

   std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);

   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
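
   // Note: this folds the Adam bias corrections into the step size. With the bias-corrected
   // moments mHat = Mt / (1 - beta1^t) and vHat = Vt / (1 - beta2^t), the textbook update
   //    theta = theta - learningRate * mHat / (sqrt(vHat) + epsilon)
   // is equivalent (up to where epsilon enters) to the form used below,
   //    theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)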

   // Adam update of the first and second moments of the weights
   for (size_t i = 0; i < weights.size(); i++) {
      // Mt = beta1 * Mt-1 + (1 - beta1) * WeightGradients
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1());
      // Vt = beta2 * Vt-1 + (1 - beta2) * WeightGradients^2
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2());
      // Weight = Weight - alpha * Mt / (sqrt(Vt) + epsilon)
      Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
                                 alpha, this->GetEpsilon());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                             const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);

   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - pow(this->GetBeta1(), this->GetGlobalStep()));

   // Adam update of the biases, using the same formulation as for the weights
   for (size_t i = 0; i < biases.size(); i++) {
      // Mt = beta1 * Mt-1 + (1 - beta1) * BiasGradients
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1());
      // Vt = beta2 * Vt-1 + (1 - beta2) * BiasGradients^2
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2());
      // theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
      Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
                                 alpha, this->GetEpsilon());
   }
}

} // namespace DNN
} // namespace TMVA

#endif