#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <cstddef>
#include <limits>
#include <vector>
   53 template<
typename Architecture_t>
 
   54 class TGradientDescent
 
   57    using Scalar_t = 
typename Architecture_t::Scalar_t;
 
   58    using Matrix_t = 
typename Architecture_t::Matrix_t;
 
   63    size_t   fConvergenceSteps; 
 
   65    size_t   fConvergenceCount; 
 
   68    Scalar_t fTrainingError;
 
   70    Scalar_t fLearningRate; 
 
   71    Scalar_t fMinimumError; 
 
   76    TGradientDescent(Scalar_t learningRate,
 
   77                     size_t   convergenceSteps,
 
   83       fMinimumError = std::numeric_limits<Scalar_t>::infinity();
 
   84       fConvergenceCount = 0;
 
   90    template <
typename Data_t, 
typename Net_t>
 
   91    Scalar_t Train(
const Data_t & TrainingDataIn, 
size_t nTrainingSamples,
 
   92                   const Data_t & TestDataIn, 
size_t nTestSamples,
 
   93                   Net_t & net, 
size_t nThreads = 1);
 
   96    template <
typename Data_t, 
typename Net_t>
 
   97    Scalar_t TrainMomentum(
const Data_t & TrainingDataIn, 
size_t nTrainingSamples,
 
   98                           const Data_t & TestDataIn, 
size_t nTestSamples,
 
   99                           Net_t & net, Scalar_t momentum, 
size_t nThreads = 1);
 
  106    template <
typename Net_t>
 
  107    void Step(Net_t &net, Matrix_t &input, 
const Matrix_t &output, 
const Matrix_t &weights);
 
  111    template <
typename Net_t>
 
  112    Scalar_t StepLoss(Net_t &net, Matrix_t &input, 
const Matrix_t &output, 
const Matrix_t &weights);
 
  120    template <
typename Net_t>
 
  121    void Step(Net_t &master,
 
  122              std::vector<Net_t> &nets,
 
  123              std::vector<TBatch<Architecture_t>> &batches);
 
  126    template <
typename Net_t>
 
  127    void StepMomentum(Net_t &master,
 
  128                      std::vector<Net_t> &nets,
 
  129                      std::vector<TBatch<Architecture_t>> &batches,
 
  131    template <
typename Net_t>
 
  135    void StepNesterov(Net_t &master,
 
  136                      std::vector<Net_t> &nets,
 
  137                      std::vector<TBatch<Architecture_t>> &batches,
 
  143    template <
typename Net_t>
 
  144    void StepReducedWeights(Net_t &net, Matrix_t &input, 
const Matrix_t &output);
 
  148    template <
typename Net_t>
 
  149    Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, 
const Matrix_t &output, 
const Matrix_t &weights);
 
  157    bool HasConverged(Scalar_t testError);
 
  159    size_t   GetConvergenceCount()
 const {
return fConvergenceCount;}
 
  160    size_t   GetConvergenceSteps()
 const {
return fConvergenceSteps;}
 
  161    Scalar_t GetTrainingError()
 const {
return fTrainingError;}
 
  162    Scalar_t GetTestError()
 const     {
return fTestError;}
 
  163    size_t   GetTestInterval()
 const  {
return fTestInterval;}
 
  165    void SetConvergenceSteps(
size_t steps) {fConvergenceSteps = steps;}
 
  166    void SetTestInterval(
size_t interval)  {fTestInterval = interval;}
 
  167    void SetLearningRate(Scalar_t rate)    {fLearningRate = rate;}
 
  168    void SetBatchSize(Scalar_t rate)       {fBatchSize    = rate;}
 
  174 template <
typename Architecture_t>
 
  175 TGradientDescent<Architecture_t>::TGradientDescent()
 
  176    : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
 
  177      fMinimumError(std::numeric_limits<Scalar_t>::infinity())
 
  183 template <
typename Architecture_t>
 
  184 TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, 
size_t convergenceSteps, 
size_t testInterval)
 
  185    : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
 
  186      fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
 
  192 template<
typename Architecture_t>
 
  193 template <
typename Data_t, 
typename Net_t>
 
  194     auto TGradientDescent<Architecture_t>::Train(
const Data_t & trainingData,
 
  195                                                  size_t nTrainingSamples,
 
  196                                                  const Data_t & testData,
 
  205    TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
 
  208                                                    net.GetOutputWidth(), nThreads);
 
  209    auto testNet = net.CreateClone(nTestSamples);
 
  210    TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
 
  211                                                   testNet.GetBatchSize(),
 
  212                                                   testNet.GetInputWidth(),
 
  213                                                   net.GetOutputWidth());
 
  214    std::vector<Net_t> nets{};
 
  215    nets.reserve(nThreads);
 
  216    for (
size_t i = 0; i < nThreads; i++) {
 
  218        for (
size_t j = 0; j < net.GetDepth(); j++)
 
  220            auto &masterLayer = net.GetLayer(j);
 
  221            auto &layer = nets.back().GetLayer(j);
 
  222            Architecture_t::Copy(layer.GetWeights(),
 
  223                                 masterLayer.GetWeights());
 
  224            Architecture_t::Copy(layer.GetBiases(),
 
  225                                 masterLayer.GetBiases());
 
  229    size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
  230    std::vector<TBatch<Architecture_t>> batches{};
 
  231    batches.reserve(nThreads);
 
  234       for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
 
  235          trainLoader.Shuffle();
 
  236          for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
 
  238             for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
 
  239             Step(net, nets, batches);
 
  243       auto b = *testLoader.begin();
 
  244       auto inputMatrix = b.GetInput();
 
  245       auto outputMatrix = b.GetOutput();
 
  246       auto weightMatrix = b.GetWeights();
 
  247       fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
 
  249    } 
while (!HasConverged());
 
  251    return fMinimumError;
 
  255 template<
typename Architecture_t>
 
  256 template <
typename Data_t, 
typename Net_t>
 
  257 auto TGradientDescent<Architecture_t>::TrainMomentum(
const Data_t & trainingData,
 
  258                                                      size_t nTrainingSamples,
 
  259                                                      const Data_t & testData,
 
  269    TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
 
  272                                                    net.GetOutputWidth(), nThreads);
 
  273    auto testNet = net.CreateClone(net.GetBatchSize());
 
  274    TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
 
  275                                                   testNet.GetBatchSize(),
 
  276                                                   testNet.GetInputWidth(),
 
  277                                                   net.GetOutputWidth());
 
  279    net.InitializeGradients();
 
  280    std::vector<Net_t> nets{};
 
  281    nets.reserve(nThreads);
 
  282    for (
size_t i = 0; i < nThreads; i++) {
 
  284        for (
size_t j = 0; j < net.GetDepth(); j++)
 
  286            auto &masterLayer = net.GetLayer(j);
 
  287            auto &layer = nets.back().GetLayer(j);
 
  288            Architecture_t::Copy(layer.GetWeights(),
 
  289                                 masterLayer.GetWeights());
 
  290            Architecture_t::Copy(layer.GetBiases(),
 
  291                                 masterLayer.GetBiases());
 
  295    size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
  296    std::vector<TBatch<Architecture_t>> batches{};
 
  297    batches.reserve(nThreads);
 
  300       for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
 
  301          trainLoader.Shuffle();
 
  302          for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
 
  304             for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
 
  305             if (momentum != 0.0) {
 
  306                StepMomentum(net, nets, batches, momentum);
 
  308                Step(net, nets, batches);
 
  314       for (
size_t i = 0; i < batchesInEpoch; i++) {
 
  315          auto b = testLoader.GetBatch();
 
  316          auto inputMatrix = b.GetInput();
 
  317          auto outputMatrix = b.GetOutput();
 
  318          auto weightMatrix = b.GetWeights();
 
  319          fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
 
  321       fTestError /= (Double_t)batchesInEpoch;
 
  322    } 
while (!HasConverged());
 
  323    return fMinimumError;
 
  327 template <
typename Architecture_t>
 
  328 template <
typename Net_t>
 
  329 void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, 
const Matrix_t &output,
 
  330                                                    const Matrix_t &weights)
 
  332    net.Forward(input, 
true);
 
  333    net.Backward(input, output, weights);
 
  335    for (
size_t i = 0; i < net.GetDepth(); i++)
 
  337       auto &layer = net.GetLayer(i);
 
  338       Architecture_t::ScaleAdd(layer.GetWeights(),
 
  339                                layer.GetWeightGradients(),
 
  341       Architecture_t::ScaleAdd(layer.GetBiases(),
 
  342                                layer.GetBiasGradients(),
 
  348 template <
typename Architecture_t>
 
  349 template <
typename Net_t>
 
  350 auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, 
const Matrix_t &output,
 
  351                                                        const Matrix_t &weights) -> Scalar_t
 
  353    Scalar_t loss = net.Loss(input, output, weights);
 
  354    net.Backward(input, output);
 
  356    for (
size_t i = 0; i < net.GetDepth(); i++)
 
  358       auto &layer = net.GetLayer(i);
 
  359       Architecture_t::ScaleAdd(layer.GetWeights(),
 
  360                                layer.GetWeightGradients(),
 
  362       Architecture_t::ScaleAdd(layer.GetBiases(),
 
  363                                layer.GetBiasGradients(),
 
  370 template<
typename Architecture_t>
 
  371     template <
typename Net_t>
 
  372     void inline TGradientDescent<Architecture_t>::Step(
 
  374         std::vector<Net_t> & nets,
 
  375         std::vector<TBatch<Architecture_t>> & batches)
 
  377    typename Architecture_t::Matrix_t dummy(0,0);
 
  378    size_t depth = master.GetDepth();
 
  381    for (
size_t j = 0; j < nets.size(); j++) {
 
  382       nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  385    for (
size_t i = 1; i < depth; i++)
 
  387       for (
size_t j = 0; j < nets.size(); j++) {
 
  388          nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  392    for (
size_t j = 0; j < nets.size(); j++) {
 
  393       evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  394                                         batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  395                                         batches[j].GetWeights());
 
  398    for (
size_t i = depth - 1; i > 0; i--)
 
  400       for (
size_t j = 0; j < nets.size(); j++) {
 
  401          nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  402                                       nets[j].GetLayer(i-1).GetOutput(),
 
  403                                       nets[j].GetRegularization(),
 
  404                                       nets[j].GetWeightDecay());
 
  407    for (
size_t j = 0; j < nets.size(); j++) {
 
  408       nets[j].GetLayer(0).Backward(dummy,
 
  409                                    batches[j].GetInput(),
 
  410                                    nets[j].GetRegularization(),
 
  411                                    nets[j].GetWeightDecay());
 
  414    for (
size_t j = 0; j < nets.size(); j++) {
 
  415       for (
size_t i = 0; i < depth; i++)
 
  417          auto &masterLayer = master.GetLayer(i);
 
  418          auto &layer       = nets[j].GetLayer(i);
 
  419          Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  420                                   layer.GetWeightGradients(),
 
  422          Architecture_t::Copy(layer.GetWeights(),
 
  423                               masterLayer.GetWeights());
 
  424          Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  425                                   layer.GetBiasGradients(),
 
  427          Architecture_t::Copy(layer.GetBiases(),
 
  428                               masterLayer.GetBiases());
 
  434 template<
typename Architecture_t>
 
  435 template <
typename Net_t>
 
  436 void inline TGradientDescent<Architecture_t>::StepMomentum(
 
  438         std::vector<Net_t> & nets,
 
  439         std::vector<TBatch<Architecture_t>> & batches,
 
  442    typename Architecture_t::Matrix_t dummy(0,0);
 
  443    size_t depth = master.GetDepth();
 
  446    for (
size_t j = 0; j < nets.size(); j++) {
 
  447       nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  450    for (
size_t i = 1; i < depth; i++)
 
  452       for (
size_t j = 0; j < nets.size(); j++) {
 
  453          nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  457    for (
size_t j = 0; j < nets.size(); j++) {
 
  458       evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  459                                         batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  460                                         batches[j].GetWeights());
 
  463    for (
size_t i = depth - 1; i > 0; i--)
 
  465       for (
size_t j = 0; j < nets.size(); j++) {
 
  466          nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  467                                       nets[j].GetLayer(i-1).GetOutput(),
 
  468                                       nets[j].GetRegularization(),
 
  469                                       nets[j].GetWeightDecay());
 
  470          Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
 
  471                                   nets[j].GetLayer(i).GetWeightGradients(),
 
  472                                   - fLearningRate / momentum);
 
  473          Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
 
  474                                   nets[j].GetLayer(i).GetBiasGradients(),
 
  475                                   - fLearningRate / momentum);
 
  477       Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
 
  478                                master.GetLayer(i).GetWeightGradients(),
 
  480       Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
 
  481                                master.GetLayer(i).GetBiasGradients(),
 
  484    for (
size_t j = 0; j < nets.size(); j++) {
 
  485       nets[j].GetLayer(0).Backward(dummy,
 
  486                                    batches[j].GetInput(),
 
  487                                    nets[j].GetRegularization(),
 
  488                                    nets[j].GetWeightDecay());
 
  489       Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
 
  490                                nets[j].GetLayer(0).GetWeightGradients(),
 
  491                                - fLearningRate / momentum);
 
  492       Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
 
  493                                nets[j].GetLayer(0).GetBiasGradients(),
 
  494                                - fLearningRate / momentum);
 
  497    Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
 
  498                             master.GetLayer(0).GetWeightGradients(),
 
  500    Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
 
  501                             master.GetLayer(0).GetBiasGradients(),
 
  504    for (
size_t i = 0; i < depth; i++)
 
  506        auto &masterLayer = master.GetLayer(i);
 
  507        Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  508                                 masterLayer.GetWeightGradients(),
 
  510        Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  511                                 masterLayer.GetBiasGradients(),
 
  513        for (
size_t j = 0; j < nets.size(); j++) {
 
  514          auto &layer       = nets[j].GetLayer(i);
 
  515          Architecture_t::Copy(layer.GetWeights(),
 
  516                               masterLayer.GetWeights());
 
  517          Architecture_t::Copy(layer.GetBiases(),
 
  518                               masterLayer.GetBiases());
 
  524 template<
typename Architecture_t>
 
  525 template <
typename Net_t>
 
  526 void inline TGradientDescent<Architecture_t>::StepNesterov(
 
  528         std::vector<Net_t> & nets,
 
  529         std::vector<TBatch<Architecture_t>> & batches,
 
  532    typename Architecture_t::Matrix_t dummy(0,0);
 
  533    size_t depth = master.GetDepth();
 
  536    for (
size_t j = 0; j < nets.size(); j++) {
 
  537       nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  540    for (
size_t i = 1; i < depth; i++)
 
  542       for (
size_t j = 0; j < nets.size(); j++) {
 
  543          nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  548    for (
size_t j = 0; j < nets.size(); j++) {
 
  549       evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  550                                         batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  551                                         batches[j].GetWeights());
 
  555    for (
size_t i = depth - 1; i > 0; i--)
 
  557       for (
size_t j = 0; j < nets.size(); j++) {
 
  558          nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  559                                       nets[j].GetLayer(i-1).GetOutput(),
 
  560                                       nets[j].GetRegularization(),
 
  561                                       nets[j].GetWeightDecay());
 
  565    for (
size_t j = 0; j < nets.size(); j++) {
 
  566       nets[j].GetLayer(0).Backward(dummy,
 
  567                                    batches[j].GetInput(),
 
  568                                    nets[j].GetRegularization(),
 
  569                                    nets[j].GetWeightDecay());
 
  572    for (
size_t i = 0; i < depth; i++)
 
  574       auto &masterLayer = master.GetLayer(i);
 
  575       for (
size_t j = 0; j < nets.size(); j++) {
 
  576          auto &layer       = nets[j].GetLayer(i);
 
  577          Architecture_t::Copy(layer.GetWeights(),
 
  578                               masterLayer.GetWeights());
 
  579          Architecture_t::Copy(layer.GetBiases(),
 
  580                               masterLayer.GetBiases());
 
  581          Architecture_t::ScaleAdd(layer.GetWeights(),
 
  582                                   masterLayer.GetWeightGradients(),
 
  584          Architecture_t::ScaleAdd(layer.GetBiases(),
 
  585                                   masterLayer.GetBiasGradients(),
 
  588       for (
size_t j = 0; j < nets.size(); j++) {
 
  589          auto &layer       = nets[j].GetLayer(i);
 
  590          Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
 
  591                                   layer.GetWeightGradients(),
 
  592                                   - fLearningRate / momentum);
 
  593          Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
 
  594                                   layer.GetBiasGradients(),
 
  595                                   - fLearningRate / momentum);
 
  597       Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
 
  598                                masterLayer.GetWeightGradients(),
 
  600       Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
 
  601                                masterLayer.GetBiasGradients(),
 
  603       Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  604                                masterLayer.GetWeightGradients(),
 
  606       Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  607                                masterLayer.GetBiasGradients(),
 
  613 template<
typename Architecture_t>
 
  614 template <
typename Net_t>
 
  615 void inline TGradientDescent<Architecture_t>::StepReducedWeights(
 
  618     const Matrix_t &output)
 
  620    net.Forward(input, 
true);
 
  621    net.Backward(input, output);
 
  623    for (
size_t i = 0; i < net.GetDepth(); i++)
 
  625       auto &layer = net.GetLayer(i);
 
  626       Architecture_t::ScaleAdd(layer.GetWeights(),
 
  627                                layer.GetWeightGradients(),
 
  630          Architecture_t::ScaleAdd(layer.GetBiases(),
 
  631                                   layer.GetBiasGradients(),
 
  638 template <
typename Architecture_t>
 
  639 template <
typename Net_t>
 
  640 auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
 
  641                                                                      const Matrix_t &output, 
const Matrix_t &weights)
 
  644    Scalar_t loss = net.Loss(input, output);
 
  645    fTrainingError = loss;
 
  646    net.Backward(input, output, weights);
 
  648    for (
size_t i = 0; i < net.GetDepth(); i++)
 
  650       auto &layer = net.GetLayer(i);
 
  651       Architecture_t::ScaleAdd(layer.GetWeights(),
 
  652                                layer.GetWeightGradients(),
 
  655          Architecture_t::ScaleAdd(layer.GetBiases(),
 
  656                                   layer.GetBiasGradients(),
 
  664 template<
typename Architecture_t>
 
  665 bool inline TGradientDescent<Architecture_t>::HasConverged()
 
  667    if (fTestError < fMinimumError * 0.999) {
 
  668       fConvergenceCount = 0;
 
  669       fMinimumError     = fTestError;
 
  674    return (fConvergenceCount >= fConvergenceSteps);
 
  678 template<
typename Architecture_t>
 
  679 bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
 
  681    fTestError = testError;
 
  682    if (fTestError < fMinimumError * 0.999) {
 
  683       fConvergenceCount = 0;
 
  684       fMinimumError     = fTestError;
 
  686       fConvergenceCount += fTestInterval;
 
  688    return (fConvergenceCount >= fConvergenceSteps);