27 #ifndef TMVA_DNN_RMSPROP
28 #define TMVA_DNN_RMSPROP
42 template <
typename Architecture_t,
typename Layer_t = VGeneralLayer<Architecture_t>,
43 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
44 class TRMSProp :
public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46 using Matrix_t =
typename Architecture_t::Matrix_t;
47 using Scalar_t =
typename Architecture_t::Scalar_t;
53 std::vector<std::vector<Matrix_t>>
54 fPastSquaredWeightGradients;
55 std::vector<std::vector<Matrix_t>>
56 fPastSquaredBiasGradients;
58 std::vector<std::vector<Matrix_t>> fWeightUpdates;
59 std::vector<std::vector<Matrix_t>> fBiasUpdates;
60 std::vector<std::vector<Matrix_t>>
62 std::vector<std::vector<Matrix_t>>
64 std::vector<std::vector<Matrix_t>>
66 std::vector<std::vector<Matrix_t>>
70 void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients);
73 void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients);
77 TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
78 Scalar_t epsilon = 1e-7);
81 ~TRMSProp() =
default;
84 Scalar_t GetMomentum()
const {
return fMomentum; }
85 Scalar_t GetRho()
const {
return fRho; }
86 Scalar_t GetEpsilon()
const {
return fEpsilon; }
88 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() {
return fPastSquaredWeightGradients; }
89 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(
size_t i) {
return fPastSquaredWeightGradients[i]; }
91 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() {
return fPastSquaredBiasGradients; }
92 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(
size_t i) {
return fPastSquaredBiasGradients[i]; }
94 std::vector<std::vector<Matrix_t>> &GetWeightUpdates() {
return fWeightUpdates; }
95 std::vector<Matrix_t> &GetWeightUpdatesAt(
size_t i) {
return fWeightUpdates[i]; }
97 std::vector<std::vector<Matrix_t>> &GetBiasUpdates() {
return fBiasUpdates; }
98 std::vector<Matrix_t> &GetBiasUpdatesAt(
size_t i) {
return fBiasUpdates[i]; }
105 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
106 TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
107 Scalar_t rho, Scalar_t epsilon)
108 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
111 std::vector<Layer_t *> &layers = deepNet.GetLayers();
112 const size_t layersNSlices = layers.size();
113 fPastSquaredWeightGradients.resize(layersNSlices);
114 fPastSquaredBiasGradients.resize(layersNSlices);
115 fWeightUpdates.resize(layersNSlices);
116 fBiasUpdates.resize(layersNSlices);
117 fWorkWeightTensor1.resize(layersNSlices);
118 fWorkBiasTensor1.resize(layersNSlices);
119 fWorkWeightTensor2.resize(layersNSlices);
120 fWorkBiasTensor2.resize(layersNSlices);
122 for (
size_t i = 0; i < layersNSlices; i++) {
123 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
125 Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
126 Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());
128 for (
size_t j = 0; j < weightsNSlices; j++) {
129 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
130 initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
133 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
135 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
136 Architecture_t::CreateWeightTensors( fBiasUpdates[i], layers[i]->GetBiases());
138 for (
size_t j = 0; j < biasesNSlices; j++) {
139 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
140 initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
142 Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
143 Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
144 Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
145 Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
150 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
151 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
152 const std::vector<Matrix_t> &weightGradients) ->
void
154 std::vector<Matrix_t> ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
155 std::vector<Matrix_t> ¤tLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);
157 for (
size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
160 auto &accumulation = fWorkWeightTensor1[layerIndex][k];
161 auto ¤tSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];
164 initialize<Architecture_t>(accumulation, EInitialization::kZero);
166 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
167 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
168 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
169 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
170 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
173 initialize<Architecture_t>(accumulation, EInitialization::kZero);
174 auto &dummy = fWorkWeightTensor2[layerIndex][k];
175 Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
176 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
177 Architecture_t::SqrtElementWise(dummy);
178 Architecture_t::ReciprocalElementWise(dummy);
179 Architecture_t::Hadamard(dummy, weightGradients[k]);
181 Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
182 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
183 Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
188 for (
size_t i = 0; i < weights.size(); i++) {
189 Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
194 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
195 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
196 const std::vector<Matrix_t> &biasGradients) ->
void
198 std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
199 std::vector<Matrix_t> ¤tLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);
201 for (
size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
204 auto &accumulation = fWorkBiasTensor1[layerIndex][k];
205 auto ¤tSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];
208 initialize<Architecture_t>(accumulation, EInitialization::kZero);
209 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
210 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
211 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
212 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
213 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
216 initialize<Architecture_t>(accumulation, EInitialization::kZero);
217 auto &dummy = fWorkBiasTensor2[layerIndex][k];
219 Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
220 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
221 Architecture_t::SqrtElementWise(dummy);
222 Architecture_t::ReciprocalElementWise(dummy);
223 Architecture_t::Hadamard(dummy, biasGradients[k]);
225 Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
226 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
227 Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
232 for (
size_t i = 0; i < biases.size(); i++) {
233 Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);