// Adam optimizer class template. It derives from
// VOptimizer<Architecture_t, Layer_t, DeepNet_t> and keeps, per layer, first- and
// second-moment matrices for both weights and biases.
//
// NOTE(review): the bare integers at the start of some lines look like line-number
// residue from a rendered source listing - confirm and strip them.
// NOTE(review): the access specifiers, the declarations of fBeta1 / fBeta2 / fEpsilon
// (returned by the getters below) and the closing "};" are not visible in this span.
42 template <
typename Architecture_t,
typename Layer_t = VGeneralLayer<Architecture_t>,
43 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
44 class TAdam :
public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
// Matrix and scalar types are taken from the architecture backend.
46 using Matrix_t =
typename Architecture_t::Matrix_t;
47 using Scalar_t =
typename Architecture_t::Scalar_t;
// Moment storage. The outer vector is indexed by layer (the constructor resizes it to
// the number of layers); the inner vector holds one matrix per weight/bias slice.
54 std::vector<std::vector<Matrix_t>> fFirstMomentWeights;
56 std::vector<std::vector<Matrix_t>> fFirstMomentBiases;
59 std::vector<std::vector<Matrix_t>> fSecondMomentWeights;
61 std::vector<std::vector<Matrix_t>> fSecondMomentBiases;
// Per-layer update of the weights; `weights` is taken by non-const reference,
// `weightGradients` is read-only.
65 void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients);
// Per-layer update of the biases; `biases` is taken by non-const reference,
// `biasGradients` is read-only.
68 void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients);
// Constructor. Defaults: learningRate = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-7.
72 TAdam(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t beta1 = 0.9, Scalar_t beta2 = 0.999,
73 Scalar_t epsilon = 1e-7);
// Read-only accessors for the stored hyper-parameters.
79 Scalar_t GetBeta1()
const {
return fBeta1; }
80 Scalar_t GetBeta2()
const {
return fBeta2; }
81 Scalar_t GetEpsilon()
const {
return fEpsilon; }
// Accessors for the moment storage. All of them return mutable references.
// The ...At(i) variants index with operator[] and perform no bounds check.
83 std::vector<std::vector<Matrix_t>> &GetFirstMomentWeights() {
return fFirstMomentWeights; }
84 std::vector<Matrix_t> &GetFirstMomentWeightsAt(
size_t i) {
return fFirstMomentWeights[i]; }
86 std::vector<std::vector<Matrix_t>> &GetFirstMomentBiases() {
return fFirstMomentBiases; }
87 std::vector<Matrix_t> &GetFirstMomentBiasesAt(
size_t i) {
return fFirstMomentBiases[i]; }
89 std::vector<std::vector<Matrix_t>> &GetSecondMomentWeights() {
return fSecondMomentWeights; }
90 std::vector<Matrix_t> &GetSecondMomentWeightsAt(
size_t i) {
return fSecondMomentWeights[i]; }
92 std::vector<std::vector<Matrix_t>> &GetSecondMomentBiases() {
return fSecondMomentBiases; }
93 std::vector<Matrix_t> &GetSecondMomentBiasesAt(
size_t i) {
return fSecondMomentBiases[i]; }
100 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
101 TAdam<Architecture_t, Layer_t, DeepNet_t>::TAdam(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t beta1,
102 Scalar_t beta2, Scalar_t epsilon)
103 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fBeta1(beta1), fBeta2(beta2),
106 std::vector<Layer_t *> &layers = deepNet.GetLayers();
107 const size_t layersNSlices = layers.size();
108 fFirstMomentWeights.resize(layersNSlices);
109 fFirstMomentBiases.resize(layersNSlices);
110 fSecondMomentWeights.resize(layersNSlices);
111 fSecondMomentBiases.resize(layersNSlices);
114 for (
size_t i = 0; i < layersNSlices; i++) {
116 Architecture_t::CreateWeightTensors( fFirstMomentWeights[i], layers[i]->GetWeights());
117 Architecture_t::CreateWeightTensors( fSecondMomentWeights[i], layers[i]->GetWeights());
119 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
121 for (
size_t j = 0; j < weightsNSlices; j++) {
122 initialize<Architecture_t>(fFirstMomentWeights[i][j], EInitialization::kZero);
123 initialize<Architecture_t>(fSecondMomentWeights[i][j], EInitialization::kZero);
126 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
128 Architecture_t::CreateWeightTensors( fFirstMomentBiases[i], layers[i]->GetBiases());
129 Architecture_t::CreateWeightTensors( fSecondMomentBiases[i], layers[i]->GetBiases());
131 for (
size_t j = 0; j < biasesNSlices; j++) {
132 initialize<Architecture_t>(fFirstMomentBiases[i][j], EInitialization::kZero);
133 initialize<Architecture_t>(fSecondMomentBiases[i][j], EInitialization::kZero);
139 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
140 auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
141 const std::vector<Matrix_t> &weightGradients) ->
void
147 std::vector<Matrix_t> ¤tLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
148 std::vector<Matrix_t> ¤tLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);
151 Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
152 (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
155 for (
size_t i = 0; i < weights.size(); i++) {
157 Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1() );
159 Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2() );
161 Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
162 alpha, this->GetEpsilon() );
167 template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
168 auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
169 const std::vector<Matrix_t> &biasGradients) ->
void
171 std::vector<Matrix_t> ¤tLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
172 std::vector<Matrix_t> ¤tLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);
175 Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
176 (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
179 for (
size_t i = 0; i < biases.size(); i++) {
181 Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1() );
183 Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2() );
185 Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
186 alpha, this->GetEpsilon() );