Adadelta.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis
 * Package: TMVA
 * Class  : TAdadelta
 * Web    : http://tmva.sourceforge.net
 *
 * Description:
 *      Adadelta Optimizer Class
 *
 * Authors (alphabetical):
 *      Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland
 *
 * Copyright (c) 2005-2018:
 *      CERN, Switzerland
 *      U. of Victoria, Canada
 *      MPI-K Heidelberg, Germany
 *      U. of Bonn, Germany
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted according to the terms listed in LICENSE
 * (http://tmva.sourceforge.net/LICENSE)
 **********************************************************************************/

#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <vector>
#include <cassert>

namespace TMVA {
namespace DNN {

/** \class TAdadelta
 * Adadelta Optimizer class
 *
 * This class implements the Adadelta optimizer, which adapts the step size of every
 * parameter from running averages of its past squared gradients and past squared updates.
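 *
 * For each parameter matrix the update performed below (see UpdateWeights/UpdateBiases) is:
 *
 *     Vt     = rho * Vt-1 + (1 - rho) * gradient^2
 *     update = sqrt(Wt-1 + epsilon) / sqrt(Vt + epsilon) * gradient
 *     theta  = theta - learningRate * update
 *     Wt     = rho * Wt-1 + (1 - rho) * update^2
 *
 * where Vt accumulates the squared gradients, Wt accumulates the squared updates, and the
 * learning rate defaults to 1.0.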
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< The rho constant used by the optimizer.
   Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< Accumulated squares of the past weight gradients of the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< Accumulated squares of the past bias gradients of the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;   ///< Accumulated squares of the past weight updates of the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;     ///< Accumulated squares of the past bias updates of the deep net.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< Working tensor holding a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< Working tensor holding a temporary copy of biases or bias gradients.
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2; ///< Working tensor holding a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;   ///< Working tensor holding a temporary copy of biases or bias gradients.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdadelta() = default;

   /*! Getters */
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};
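
// Example usage (illustrative sketch only, not part of this header). It assumes `net` is an
// already constructed TDeepNet<Architecture_t> and that the surrounding framework drives the
// optimizer through the VOptimizer interface (e.g. a per-batch Step() call, which is an
// assumption about that interface rather than something defined in this file):
//
//    TAdadelta<Architecture_t> optimizer(net);   // learningRate = 1.0, rho = 0.95, epsilon = 1e-8
//    // after the forward/backward pass has filled the gradients stored in `net`:
//    optimizer.Step();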

//
//
//  The Adadelta Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

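   // For every layer, allocate one state matrix per weight/bias matrix with the same shape as
   // the corresponding parameter, and zero-initialize the gradient and update accumulators.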
   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fPastSquaredWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fPastSquaredBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
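      // Note on buffers: within this loop body the per-layer work tensors are reused under several
      // local names (accumulation and dummy1 share fWorkWeightTensor1; the squared-gradient, update
      // and squared-update matrices share fWorkWeightTensor2). Each intermediate result is copied
      // into the persistent accumulators, or applied to the weights, before its buffer is overwritten.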
      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][i];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);

      // updating the weights:
      // currentWeightUpdates = sqrt(Wt-1 + epsilon) * currentGradients / sqrt(Vt + epsilon)

      // dummy1 = sqrt(Wt-1 + epsilon)
      auto &dummy1 = fWorkWeightTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse the work tensor for the weight updates now

      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta = theta - learningRate * currentWeightUpdates
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1 - rho) * currentSquaredWeightUpdates
      // reuse the accumulation matrix for temporary storage of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {
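      // As in UpdateWeights, the per-layer work tensors fWorkBiasTensor1/2 are reused under several
      // local names within this loop body; each intermediate result is copied out, or applied to the
      // biases, before its buffer is overwritten.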

      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][i];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);

      // updating the biases:
      // currentBiasUpdates = sqrt(Wt-1 + epsilon) * currentGradients / sqrt(Vt + epsilon)

      // dummy1 = sqrt(Wt-1 + epsilon)
      auto &dummy1 = fWorkBiasTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta = theta - learningRate * currentBiasUpdates
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1 - rho) * currentSquaredBiasUpdates
      // reuse the accumulation matrix for temporary storage of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif