RMSProp.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                   *
 * Class  : TRMSProp                                                               *
 * Web    : http://tmva.sourceforge.net                                            *
 *                                                                                 *
 * Description:                                                                    *
 *      RMSProp Optimizer Class                                                    *
 *                                                                                 *
 * Authors (alphabetical):                                                         *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland          *
 *                                                                                 *
 * Copyright (c) 2005-2018:                                                        *
 *      CERN, Switzerland                                                          *
 *      U. of Victoria, Canada                                                     *
 *      MPI-K Heidelberg, Germany                                                  *
 *      U. of Bonn, Germany                                                        *
 *                                                                                 *
 * Redistribution and use in source and binary forms, with or without              *
 * modification, are permitted according to the terms listed in LICENSE            *
 * (http://tmva.sourceforge.net/LICENSE)                                           *
 **********************************************************************************/

#ifndef TMVA_DNN_RMSPROP
#define TMVA_DNN_RMSPROP

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TRMSProp
 * RMSProp Optimizer class
 *
 * This class implements the RMSProp optimizer, with an optional momentum term.
 */
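// The update rule implemented below (it mirrors the comments in UpdateWeights
// and UpdateBiases) is, element-wise for every weight and bias tensor:
//
//   V_t     = rho * V_{t-1} + (1 - rho) * g_t^2
//   W_t     = momentum * W_{t-1} + learningRate * g_t / sqrt(V_t + epsilon)
//   theta_t = theta_{t-1} - W_t
//
// where g_t are the current gradients, V_t the decaying average of their
// squares, and W_t the update that is subtracted from the parameters theta.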
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   Scalar_t fRho;      ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon;  ///< The smoothing term used to avoid division by zero.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The decaying average of the squared past weight gradients of the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The decaying average of the squared past bias gradients of the deep net.

   std::vector<std::vector<Matrix_t>> fWeightUpdates; ///< The accumulated past weight updates, used for applying momentum.
   std::vector<std::vector<Matrix_t>> fBiasUpdates;   ///< The accumulated past bias updates, used for applying momentum.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor1; ///< Working tensor used to keep a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor1; ///< Working tensor used to keep a temporary copy of biases or bias gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor2; ///< Working tensor used to keep a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor2; ///< Working tensor used to keep a temporary copy of biases or bias gradients.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
            Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TRMSProp() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
   std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
   std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
};
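
// For intuition, a minimal scalar sketch of the same update (illustrative only,
// not part of TMVA; `gradientSteps`, `theta`, `lr`, `rho`, `eps`, and `momentum`
// are hypothetical scalars standing in for the matrix operations used below):
//
//   double v = 0.0, w = 0.0;                           // V_{t-1} and W_{t-1}
//   for (double g : gradientSteps) {
//      v = rho * v + (1.0 - rho) * g * g;              // decaying average of g^2
//      w = momentum * w + lr * g / std::sqrt(v + eps); // momentum + scaled step
//      theta -= w;                                     // theta = theta - W_t
//   }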

//
//
// The RMSProp Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
                                                       Scalar_t rho, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWeightUpdates.resize(layersNSlices);
   fBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
      }
      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][k];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentWeightGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &dummy = fWorkWeightTensor2[layerIndex][k]; // reuse the working tensor; its previous contents are no longer needed
      Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, weightGradients[k]);

      Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
   }

   // updating the weights.
   // theta = theta - Wt
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][k];
      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentBiasGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &dummy = fWorkBiasTensor2[layerIndex][k]; // reuse the working tensor; its previous contents are no longer needed

      Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, biasGradients[k]);

      Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
   }

   // updating the biases.
   // theta = theta - Wt
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);
   }
}
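
// A hedged usage sketch: assuming an already constructed TDeepNet `net`, the
// optimizer is created with the defaults below and driven during training by
// the VOptimizer base class (the Step() call is assumed from that interface):
//
//   TRMSProp<Architecture_t> optimizer(net, /*learningRate=*/0.001,
//                                      /*momentum=*/0.0, /*rho=*/0.9,
//                                      /*epsilon=*/1e-7);
//   // after each forward/backward pass over a batch:
//   optimizer.Step(); // dispatches to UpdateWeights / UpdateBiases per layer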

} // namespace DNN
} // namespace TMVA

#endif