Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
SGD.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Ravi Kiran S
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : TSGD *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Stochastic Batch Gradient Descent Optimizer Class *
12  * *
13  * Authors (alphabetical): *
14  * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15  * *
16  * Copyright (c) 2005-2018: *
17  * CERN, Switzerland *
18  * U. of Victoria, Canada *
19  * MPI-K Heidelberg, Germany *
20  * U. of Bonn, Germany *
21  * *
22  * Redistribution and use in source and binary forms, with or without *
23  * modification, are permitted according to the terms listed in LICENSE *
24  * (http://tmva.sourceforge.net/LICENSE) *
25  **********************************************************************************/
26 
27 #ifndef TMVA_DNN_SGD
28 #define TMVA_DNN_SGD
29 
30 #include "TMatrix.h"
31 #include "TMVA/DNN/Optimizer.h"
32 #include "TMVA/DNN/Functions.h"
33 
34 namespace TMVA {
35 namespace DNN {
36 
37 /** \class TSGD
38  * Stochastic Batch Gradient Descent Optimizer class
39  *
40  * This class represents the Stochastic Batch Gradient Descent Optimizer with options for applying momentum
41  * and nesterov momentum.
42  */
43 template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
44  typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
45 class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46 public:
47  using Matrix_t = typename Architecture_t::Matrix_t;
48  using Scalar_t = typename Architecture_t::Scalar_t;
49 
50 protected:
51  Scalar_t fMomentum; ///< The momentum used for training.
52  std::vector<std::vector<Matrix_t>>
53  fPastWeightGradients; ///< The sum of the past weight gradients associated with the deep net.
54  std::vector<std::vector<Matrix_t>>
55  fPastBiasGradients; ///< The sum of the past bias gradients associated with the deep net.
56 
57  /*! Update the weights, given the current weight gradients. */
58  void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
59 
60  /*! Update the biases, given the current bias gradients. */
61  void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
62 
63 public:
64  /*! Constructor. */
65  TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);
66 
67  /*! Destructor. */
68  ~TSGD() = default;
69 
70  /*! Getters */
71  Scalar_t GetMomentum() const { return fMomentum; }
72 
73  std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
74  std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }
75 
76  std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
77  std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
78 };
79 
80 //
81 //
82 // The Stochastic Gradient Descent Optimizer Class - Implementation
83 //_________________________________________________________________________________________________
84 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
85 TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
86  : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
87 {
88  std::vector<Layer_t *> &layers = deepNet.GetLayers();
89  size_t layersNSlices = layers.size();
90  fPastWeightGradients.resize(layersNSlices);
91  fPastBiasGradients.resize(layersNSlices);
92 
93  for (size_t i = 0; i < layersNSlices; i++) {
94 
95  Architecture_t::CreateWeightTensors( fPastWeightGradients[i], layers[i]->GetWeights());
96  size_t weightsNSlices = fPastWeightGradients[i].size();
97  for (size_t j = 0; j < weightsNSlices; j++) {
98  initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
99  }
100 
101  Architecture_t::CreateWeightTensors( fPastBiasGradients[i], layers[i]->GetBiases());
102  size_t biasesNSlices = fPastBiasGradients[i].size();
103  for (size_t j = 0; j < biasesNSlices; j++) {
104  initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
105  }
106  }
107 }
108 
109 
110 
111 //_________________________________________________________________________________________________
112 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
113 auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
114  const std::vector<Matrix_t> &weightGradients) -> void
115 {
116  // accumulating the current layer past weight gradients to include the current weight gradients.
117  // Vt = momentum * Vt-1 + currentGradients
118 
119  std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);
120 
121  for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
122  Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
123  Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
124  }
125 
126  // updating the weights.
127  // theta = theta - learningRate * Vt
128  for (size_t i = 0; i < weights.size(); i++) {
129  Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
130  }
131 }
132 
133 //_________________________________________________________________________________________________
134 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
135 auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
136  const std::vector<Matrix_t> &biasGradients) -> void
137 {
138  // accumulating the current layer past bias gradients to include the current bias gradients.
139  // Vt = momentum * Vt-1 + currentGradients
140 
141  std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);
142 
143  for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
144  Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
145  Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
146  }
147 
148  // updating the biases
149  // theta = theta - learningRate * Vt
150  for (size_t i = 0; i < biases.size(); i++) {
151  Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
152  }
153 }
154 
155 } // namespace DNN
156 } // namespace TMVA
157 
158 #endif