27 #ifndef TMVA_DNN_ADADELTA
28 #define TMVA_DNN_ADADELTA
42 template <
typename Architecture_t,
typename Layer_t = VGeneralLayer<Architecture_t>,
43 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
44 class TAdadelta :
public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46 using Matrix_t =
typename Architecture_t::Matrix_t;
47 using Scalar_t =
typename Architecture_t::Scalar_t;
52 std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients;
54 std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;
57 std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;
59 std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;
61 std::vector<std::vector<Matrix_t>> fWorkWeightTensor1;
62 std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;
63 std::vector<std::vector<Matrix_t>> fWorkWeightTensor2;
64 std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;
67 void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients);
70 void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients);
74 TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);
77 ~TAdadelta() =
default;
80 Scalar_t GetRho()
const {
return fRho; }
81 Scalar_t GetEpsilon()
const {
return fEpsilon; }
83 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() {
return fPastSquaredWeightGradients; }
84 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(
size_t i) {
return fPastSquaredWeightGradients[i]; }
86 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() {
return fPastSquaredBiasGradients; }
87 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(
size_t i) {
return fPastSquaredBiasGradients[i]; }
89 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() {
return fPastSquaredWeightUpdates; }
90 std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(
size_t i) {
return fPastSquaredWeightUpdates[i]; }
92 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() {
return fPastSquaredBiasUpdates; }
93 std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(
size_t i) {
return fPastSquaredBiasUpdates[i]; }
100 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
101 TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
103 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
105 std::vector<Layer_t *> &layers = deepNet.GetLayers();
106 const size_t layersNSlices = layers.size();
107 fPastSquaredWeightGradients.resize(layersNSlices);
108 fPastSquaredBiasGradients.resize(layersNSlices);
109 fPastSquaredWeightUpdates.resize(layersNSlices);
110 fPastSquaredBiasUpdates.resize(layersNSlices);
111 fWorkWeightTensor1.resize(layersNSlices);
112 fWorkBiasTensor1.resize(layersNSlices);
113 fWorkWeightTensor2.resize(layersNSlices);
114 fWorkBiasTensor2.resize(layersNSlices);
116 for (
size_t i = 0; i < layersNSlices; i++) {
117 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
119 Architecture_t::CreateWeightTensors( fPastSquaredWeightGradients[i], layers[i]->GetWeights());
120 Architecture_t::CreateWeightTensors( fPastSquaredWeightUpdates[i], layers[i]->GetWeights());
122 for (
size_t j = 0; j < weightsNSlices; j++) {
123 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
124 initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
127 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
129 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
130 Architecture_t::CreateWeightTensors( fPastSquaredBiasUpdates[i], layers[i]->GetBiases());
132 for (
size_t j = 0; j < biasesNSlices; j++) {
133 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
134 initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
137 Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
138 Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
139 Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
140 Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
145 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
146 auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
147 const std::vector<Matrix_t> &weightGradients) ->
void
149 std::vector<Matrix_t> ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
150 std::vector<Matrix_t> ¤tLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);
152 const size_t weightsNSlices = weights.size();
153 assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);
155 for (
size_t i = 0; i < weightsNSlices; i++) {
157 auto &accumulation = fWorkWeightTensor1[layerIndex][i];
158 auto ¤tSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];
161 initialize<Architecture_t>(accumulation, EInitialization::kZero);
163 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
164 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
165 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
166 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
167 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);
174 auto &dummy1 = fWorkWeightTensor1[layerIndex][i];
175 Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
176 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
177 Architecture_t::SqrtElementWise(dummy1);
179 auto ¤tWeightUpdates = fWorkWeightTensor2[layerIndex][i];
181 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
182 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
183 Architecture_t::SqrtElementWise(currentWeightUpdates);
184 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
185 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
186 Architecture_t::Hadamard(currentWeightUpdates, dummy1);
189 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
193 initialize<Architecture_t>(accumulation, EInitialization::kZero);
194 auto ¤tSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i];
195 Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
196 Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
197 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
198 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
199 Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
204 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
205 auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
206 const std::vector<Matrix_t> &biasGradients) ->
void
208 std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
209 std::vector<Matrix_t> ¤tLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);
211 const size_t biasesNSlices = biases.size();
212 assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
213 for (
size_t i = 0; i < biasesNSlices; i++) {
216 auto &accumulation = fWorkBiasTensor1[layerIndex][i];
219 initialize<Architecture_t>(accumulation, EInitialization::kZero);
221 auto ¤tSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
222 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
223 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
224 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
225 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
226 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);
232 auto &dummy1 = fWorkBiasTensor1[layerIndex][i];
233 Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
234 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
235 Architecture_t::SqrtElementWise(dummy1);
237 auto ¤tBiasUpdates = fWorkBiasTensor2[layerIndex][i];
238 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
239 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
240 Architecture_t::SqrtElementWise(currentBiasUpdates);
241 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
242 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
243 Architecture_t::Hadamard(currentBiasUpdates, dummy1);
246 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
251 initialize<Architecture_t>(accumulation, EInitialization::kZero);
252 auto ¤tSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i];
253 Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
254 Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
255 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
256 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
257 Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);