27 #ifndef TMVA_DNN_ADAGRAD
28 #define TMVA_DNN_ADAGRAD
42 template <
typename Architecture_t,
typename Layer_t = VGeneralLayer<Architecture_t>,
43 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
44 class TAdagrad :
public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46 using Matrix_t =
typename Architecture_t::Matrix_t;
47 using Scalar_t =
typename Architecture_t::Scalar_t;
52 std::vector<std::vector<Matrix_t>>
53 fPastSquaredWeightGradients;
54 std::vector<std::vector<Matrix_t>>
55 fPastSquaredBiasGradients;
56 std::vector<std::vector<Matrix_t>>
58 std::vector<std::vector<Matrix_t>>
62 void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients);
65 void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients);
69 TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);
72 ~TAdagrad() =
default;
75 Scalar_t GetEpsilon()
const {
return fEpsilon; }
77 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() {
return fPastSquaredWeightGradients; }
78 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(
size_t i) {
return fPastSquaredWeightGradients[i]; }
80 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() {
return fPastSquaredBiasGradients; }
81 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(
size_t i) {
return fPastSquaredBiasGradients[i]; }
88 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
89 TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
90 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
92 std::vector<Layer_t *> &layers = deepNet.GetLayers();
93 const size_t layersNSlices = layers.size();
94 fPastSquaredWeightGradients.resize(layersNSlices);
95 fPastSquaredBiasGradients.resize(layersNSlices);
96 fWorkWeightTensor.resize(layersNSlices);
97 fWorkBiasTensor.resize(layersNSlices);
99 for (
size_t i = 0; i < layersNSlices; i++) {
100 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
103 Architecture_t::CreateWeightTensors( fPastSquaredWeightGradients[i], layers[i]->GetWeights());
105 for (
size_t j = 0; j < weightsNSlices; j++) {
106 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
109 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
111 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
113 for (
size_t j = 0; j < biasesNSlices; j++) {
114 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
117 Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
118 Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
124 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
125 auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
126 const std::vector<Matrix_t> &weightGradients) ->
void
128 auto ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
131 const size_t weightsNSlices = weights.size();
132 assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);
134 for (
size_t i = 0; i < weightsNSlices; i++) {
136 auto ¤tSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
138 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
139 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
140 Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);
145 auto ¤tWeightUpdates = fWorkWeightTensor[layerIndex][i];
146 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
147 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
148 Architecture_t::SqrtElementWise(currentWeightUpdates);
149 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
150 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
151 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
156 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
157 auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
158 const std::vector<Matrix_t> &biasGradients) ->
void
160 std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
162 const size_t biasesNSlices = biases.size();
163 assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
164 for (
size_t i = 0; i < biasesNSlices; i++) {
167 auto ¤tSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
168 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
169 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
170 Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);
175 auto ¤tBiasUpdates = fWorkBiasTensor[layerIndex][i];
176 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
177 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
178 Architecture_t::SqrtElementWise(currentBiasUpdates);
179 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
180 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
181 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());