27 #ifndef TMVA_DNN_RMSPROP 
   28 #define TMVA_DNN_RMSPROP 
   42 template <
typename Architecture_t, 
typename Layer_t = VGeneralLayer<Architecture_t>,
 
   43           typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
 
   44 class TRMSProp : 
public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
 
   46    using Matrix_t = 
typename Architecture_t::Matrix_t;
 
   47    using Scalar_t = 
typename Architecture_t::Scalar_t;
 
   53    std::vector<std::vector<Matrix_t>>
 
   54       fPastSquaredWeightGradients; 
 
   55    std::vector<std::vector<Matrix_t>>
 
   56       fPastSquaredBiasGradients; 
 
   58    std::vector<std::vector<Matrix_t>> fWeightUpdates; 
 
   59    std::vector<std::vector<Matrix_t>> fBiasUpdates;   
 
   60    std::vector<std::vector<Matrix_t>>
 
   62    std::vector<std::vector<Matrix_t>>
 
   64    std::vector<std::vector<Matrix_t>>
 
   66    std::vector<std::vector<Matrix_t>>
 
   70    void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights, 
const std::vector<Matrix_t> &weightGradients);
 
   73    void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases, 
const std::vector<Matrix_t> &biasGradients);
 
   77    TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
 
   78             Scalar_t epsilon = 1e-7);
 
   81    ~TRMSProp() = 
default;
 
   84    Scalar_t GetMomentum()
 const { 
return fMomentum; }
 
   85    Scalar_t GetRho()
 const { 
return fRho; }
 
   86    Scalar_t GetEpsilon()
 const { 
return fEpsilon; }
 
   88    std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { 
return fPastSquaredWeightGradients; }
 
   89    std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(
size_t i) { 
return fPastSquaredWeightGradients[i]; }
 
   91    std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { 
return fPastSquaredBiasGradients; }
 
   92    std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(
size_t i) { 
return fPastSquaredBiasGradients[i]; }
 
   94    std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { 
return fWeightUpdates; }
 
   95    std::vector<Matrix_t> &GetWeightUpdatesAt(
size_t i) { 
return fWeightUpdates[i]; }
 
   97    std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { 
return fBiasUpdates; }
 
   98    std::vector<Matrix_t> &GetBiasUpdatesAt(
size_t i) { 
return fBiasUpdates[i]; }
 
  105 template <
typename Architecture_t, 
typename Layer_t, 
typename DeepNet_t>
 
  106 TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
 
  107                                                        Scalar_t rho, Scalar_t epsilon)
 
  108    : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
 
  111    std::vector<Layer_t *> &layers = deepNet.GetLayers();
 
  112    const size_t layersNSlices = layers.size();
 
  113    fPastSquaredWeightGradients.resize(layersNSlices);
 
  114    fPastSquaredBiasGradients.resize(layersNSlices);
 
  115    fWeightUpdates.resize(layersNSlices);
 
  116    fBiasUpdates.resize(layersNSlices);
 
  117    fWorkWeightTensor1.resize(layersNSlices);
 
  118    fWorkBiasTensor1.resize(layersNSlices);
 
  119    fWorkWeightTensor2.resize(layersNSlices);
 
  120    fWorkBiasTensor2.resize(layersNSlices);
 
  122    for (
size_t i = 0; i < layersNSlices; i++) {
 
  123       const size_t weightsNSlices = (layers[i]->GetWeights()).size();
 
  125       Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
 
  126       Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());
 
  128       for (
size_t j = 0; j < weightsNSlices; j++) {
 
  129          initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
 
  130          initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
 
  133       const size_t biasesNSlices = (layers[i]->GetBiases()).size();
 
  135       Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases()); 
 
  136       Architecture_t::CreateWeightTensors( fBiasUpdates[i], layers[i]->GetBiases()); 
 
  138       for (
size_t j = 0; j < biasesNSlices; j++) {
 
  139          initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
 
  140          initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
 
  142       Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
 
  143       Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
 
  144       Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
 
  145       Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
 
  150 template <
typename Architecture_t, 
typename Layer_t, 
typename DeepNet_t>
 
  151 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
 
  152                                                                  const std::vector<Matrix_t> &weightGradients) -> 
void 
  154    std::vector<Matrix_t> ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
 
  155    std::vector<Matrix_t> ¤tLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);
 
  157    for (
size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
 
  160       auto &accumulation = fWorkWeightTensor1[layerIndex][k];
 
  161       auto ¤tSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];
 
  164       initialize<Architecture_t>(accumulation, EInitialization::kZero);
 
  166       Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
 
  167       Architecture_t::SquareElementWise(currentSquaredWeightGradients);
 
  168       Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
 
  169       Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
 
  170       Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
 
  173       initialize<Architecture_t>(accumulation, EInitialization::kZero);
 
  174       auto &dummy = fWorkWeightTensor2[layerIndex][k]; 
 
  175       Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
 
  176       Architecture_t::ConstAdd(dummy, this->GetEpsilon());
 
  177       Architecture_t::SqrtElementWise(dummy);
 
  178       Architecture_t::ReciprocalElementWise(dummy);
 
  179       Architecture_t::Hadamard(dummy, weightGradients[k]);
 
  181       Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
 
  182       Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
 
  183       Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
 
  188    for (
size_t i = 0; i < weights.size(); i++) {
 
  189       Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
 
  194 template <
typename Architecture_t, 
typename Layer_t, 
typename DeepNet_t>
 
  195 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
 
  196                                                                 const std::vector<Matrix_t> &biasGradients) -> 
void 
  198    std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
 
  199    std::vector<Matrix_t> ¤tLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);
 
  201    for (
size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
 
  204       auto &accumulation = fWorkBiasTensor1[layerIndex][k];
 
  205       auto ¤tSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];
 
  208       initialize<Architecture_t>(accumulation, EInitialization::kZero);
 
  209       Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
 
  210       Architecture_t::SquareElementWise(currentSquaredBiasGradients);
 
  211       Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
 
  212       Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
 
  213       Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
 
  216       initialize<Architecture_t>(accumulation, EInitialization::kZero);
 
  217       auto &dummy = fWorkBiasTensor2[layerIndex][k]; 
 
  219       Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
 
  220       Architecture_t::ConstAdd(dummy, this->GetEpsilon());
 
  221       Architecture_t::SqrtElementWise(dummy);
 
  222       Architecture_t::ReciprocalElementWise(dummy);
 
  223       Architecture_t::Hadamard(dummy, biasGradients[k]);
 
  225       Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
 
  226       Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
 
  227       Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
 
  232    for (
size_t i = 0; i < biases.size(); i++) {
 
  233       Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);