// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TAdagrad                                                              *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Adagrad Optimizer Class                                                   *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <cassert>
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TAdagrad
 * Adagrad Optimizer class
 *
 * This class implements the Adagrad optimizer. For every weight and bias
 * matrix it accumulates the element-wise sum of the squared past gradients,
 * \f$ V_t = V_{t-1} + g_t^2 \f$, and performs the update
 * \f$ \theta_{t+1} = \theta_t - \eta \, g_t / \sqrt{V_t + \epsilon} \f$,
 * where \f$\eta\f$ is the learning rate and \f$\epsilon\f$ is a smoothing
 * term that avoids division by zero. A minimal usage sketch is given after
 * the class declaration below.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The sum of the squares of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The sum of the squares of the past bias gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor; ///< Working tensor used to keep a temporary copy of the weights or weight gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor; ///< Working tensor used to keep a temporary copy of the biases or bias gradients.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdagrad() = default;

   /*! Getters */
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};
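
// A minimal usage sketch. The construction of `deepNet` is an assumption for
// illustration and depends on the chosen Architecture_t; see TDeepNet for the
// actual interface. After backpropagation has filled the layer gradients, one
// optimization step is performed through the inherited VOptimizer interface:
//
//    TAdagrad<Architecture_t> optimizer(deepNet, /*learningRate=*/0.01, /*epsilon=*/1e-8);
//    optimizer.Step(); // updates the weights and biases of every layer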

//
//
// The Adagrad Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWorkWeightTensor.resize(layersNSlices);
   fWorkBiasTensor.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      // the accumulator tensors must have the same shape as the layer's
      // weight and weight-gradient tensors
      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
   }
}
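
// A consequence of the zero initialization above: on the very first step
// V_1 = g_1^2, so each element of the first update is
// -lr * g_1 / sqrt(g_1^2 + eps), roughly -lr * sign(g_1) whenever
// |g_1| >> sqrt(eps). The first Adagrad step therefore has magnitude close
// to the learning rate, independent of the gradient scale.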

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   auto &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {

      // accumulate the squared gradients: Vt = Vt-1 + gt^2
      auto &currentSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);

      // update the weights: theta = theta - learningRate * gt / sqrt(Vt + epsilon)
      auto &currentWeightUpdates = fWorkWeightTensor[layerIndex][i]; // reuse the work tensor for the weight updates now
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
   }
}
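
// A quick numeric check of the update above (scalar illustration, lr = 0.1,
// eps ~ 0): with gradient g = 2 at the first step, V = 4 and the step is
// -0.1 * 2 / sqrt(4) = -0.1; with g = 2 again at the second step, V = 8 and
// the step shrinks to -0.1 * 2 / sqrt(8) ~ -0.0707. The per-element effective
// learning rate decays monotonically, which is the defining property of Adagrad.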

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {

      // accumulate the squared gradients: Vt = Vt-1 + gt^2
      auto &currentSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);

      // update the biases: theta = theta - learningRate * gt / sqrt(Vt + epsilon)
      auto &currentBiasUpdates = fWorkBiasTensor[layerIndex][i]; // reuse the work tensor for the bias updates now
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif