#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <vector>
#include <cassert>

namespace TMVA {
namespace DNN {

/** \class TAdagrad
 *  Adagrad optimizer: adapts the step size of each parameter using the
 *  accumulated sum of its past squared gradients.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fEpsilon; ///< Smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< Accumulated sum of squared past weight gradients.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< Accumulated sum of squared past bias gradients.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor; ///< Work tensor holding temporary copies of the weight gradients.
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor;   ///< Work tensor holding temporary copies of the bias gradients.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                      const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                     const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdagrad() = default;

   /*! Getters */
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};
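/* Usage sketch (illustrative only, not part of this header). It assumes a
 * TDeepNet that has already been built and filled with layers elsewhere; the
 * architecture, learning rate and epsilon below are placeholder choices, and
 * stepping relies on the VOptimizer::Step interface provided by the base class.
 *
 *    using Architecture_t = TCpu<Double_t>;
 *    TDeepNet<Architecture_t> deepNet = ...;                  // built and trained elsewhere
 *    TAdagrad<Architecture_t> optimizer(deepNet, 0.01, 1e-8);
 *    // per training batch, after forward/backward have filled the gradients:
 *    optimizer.Step();   // expected to call UpdateWeights/UpdateBiases for every layer
 */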
 
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWorkWeightTensor.resize(layersNSlices);
   fWorkBiasTensor.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      // accumulator for the past squared weight gradients, initialized to zero
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();
      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
      }

      // accumulator for the past squared bias gradients, initialized to zero
      const size_t biasesNSlices = (layers[i]->GetBiases()).size();
      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
      }

      // scratch tensors used as temporaries during the updates
      Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
   }
}
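/* The two update routines below implement the standard AdaGrad rule for a
 * parameter tensor theta with current gradient g_t (all operations element-wise):
 *
 *    G_t     = G_{t-1} + g_t * g_t
 *    theta_t = theta_{t-1} - learningRate * g_t / sqrt(G_t + epsilon)
 *
 * G_t is stored in fPastSquaredWeightGradients / fPastSquaredBiasGradients,
 * and the per-element quotient is built in the fWork*Tensor scratch space.
 */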
 
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   auto &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // accumulate the current squared weight gradients into the past squared weight gradients
      auto &currentSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);

      // update the weights: the work matrix is reused to build
      // learningRate * g_t / sqrt(G_t + epsilon), which is then subtracted
      auto &currentWeightUpdates = fWorkWeightTensor[layerIndex][i];
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
   }
}
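// UpdateBiases mirrors UpdateWeights: it operates on the bias tensors and their
// accumulated squared gradients, reusing fWorkBiasTensor as scratch space.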
 
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);

   for (size_t i = 0; i < biasesNSlices; i++) {
      // accumulate the current squared bias gradients into the past squared bias gradients
      auto &currentSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);

      // update the biases: subtract learningRate * g_t / sqrt(G_t + epsilon)
      auto &currentBiasUpdates = fWorkBiasTensor[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_ADAGRAD