Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
DLMinimizers.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/cnn:$Id$
2 // Author: Vladimir Ilievski
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : TDLGradientDescent *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Deep Learning Minimizers *
12  * *
13  * Authors (alphabetical): *
14  * Vladimir Ilievski <ilievski.vladimir@live.com> - CERN, Switzerland *
15  * *
16  * Copyright (c) 2005-2015: *
17  * CERN, Switzerland *
18  * U. of Victoria, Canada *
19  * MPI-K Heidelberg, Germany *
20  * U. of Bonn, Germany *
21  * *
22  * Redistribution and use in source and binary forms, with or without *
23  * modification, are permitted according to the terms listed in LICENSE *
24  * (http://tmva.sourceforge.net/LICENSE) *
25  **********************************************************************************/
26 
27 #ifndef TMVA_DNN_DLMINIMIZERS
28 #define TMVA_DNN_DLMINIMIZERS
29 
31 #include "TMVA/DNN/Functions.h"
32 #include "TMVA/DNN/DeepNet.h"
33 
34 #include <limits>
35 #include <iostream>
36 
37 namespace TMVA {
38 namespace DNN {
39 
40 /*** \class TDLGradientDescent
41  *
42  * Generic implementation of gradient descent minimization for the
43  * deep learning neural nets.
44  *
45  * The TDLGradientDescent class implements an architecture, input data and
46  * deep learning neural network type independent implementation of the gradient
47  * descent minimization algorithm.
48  *
49 * This is provided by the Step(...), StepMomentum(...) and
50  * StepNesterov(...) functions that perform a single minimization step.
51  *
52  * The main training characteristics are defined by the provided learning rate,
53  * the test interval, and the convergence steps required for convergence. The
54  * test interval defines how often the error on the validation set is computed,
55  * and the value by which the convergence counter is increased each time
56  * HasConverged(Scalar_t) sees no sufficient improvement. A convergence step is
57  * defined as a step in which the test error is NOT less than 0.999 times the
58  * current minimal test error that has been reached. If between two subsequent
59  * calls to HasConverged(Scalar_t) the test error has not been sufficiently
60  * reduced, it is assumed that a number of convergence steps equal to the test
61  * interval has been performed.
62  */
63 
64 template <typename Architecture_t>
65 class TDLGradientDescent {
66 public:
67  using DeepNet_t = TDeepNet<Architecture_t>;
68  using Scalar_t = typename Architecture_t::Scalar_t;
69  using Matrix_t = typename Architecture_t::Matrix_t;
70 
71 private:
72  size_t fBatchSize; ///< Batch size to use for the training.
73  size_t fStepCount; ///< Number of steps performed in the current training session
74  size_t fConvergenceSteps; ///< Number of training epochs without considerable
75  ///< decrease in the test error for convergence.
76  size_t fConvergenceCount; ///< Current number of training epochs without
77  ///< considerable decrease in the test error.
78  size_t fTestInterval; ///< Interval for the computation of the test error.
79  Scalar_t fTrainingError; ///< Holds the most recently computed training loss.
80  Scalar_t fTestError; ///< Holds the most recently computed test loss.
81  Scalar_t fLearningRate; ///< Learning rate \f$\alpha\f$
82  Scalar_t fMinimumError; ///< The minimum loss achieved on the training set
83  ///< during the current traning session.
84 
85 public:
86  TDLGradientDescent();
87  TDLGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);
88 
89  /** Reset minimizer object to default state. */
90  void Reset()
91  {
92  fMinimumError = std::numeric_limits<Scalar_t>::infinity();
93  fConvergenceCount = 0;
94  fStepCount = 0;
95  };
96 
97  /** Perform a single optimization step on a given batch. Propagates the input
98  matrix foward through the net, evaluates the loss and propagates the gradients
99  backward through the net. The computed gradients are scaled by the learning
100  rate \f$\alpha\f$ and subtracted from the weights and bias values of each
101  layer. */
102  void Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);
103 
104  /** Does not evaluate the loss and therefore not trigger a possible synchronization
105  * with the device. Trains the weights of each layer, but only the bias terms of
106  * the first layer for compatibility with the previous implementation. */
107  void StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
108  const Matrix_t &weights);
109 
110  /** Same as Step(...) but also evaluate the loss on the given training data.
111  * Note that this requires synchronization between host and device. */
112  Scalar_t StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);
113 
114  /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
115  * synchronization with the device. */
116  Scalar_t StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
117  const Matrix_t &weights);
118 
119  /** Perform multiple optimization steps simultaneously. Performs the
120  * backprop algorithm on the input batches given in \p batches on
121  * the neural networks given in \p nets. The forward and backward propagation
122  * steps are executed in an interleaving manner in order to exploit potential
123  * batch-level parallelism for asynchronous device calls.
124  */
125  void Step(DeepNet_t &master, std::vector<DeepNet_t> &nets, std::vector<TTensorBatch<Architecture_t>> &batches);
126 
127  /** Same as the Step(...) method for multiple batches but uses momentum. */
128  void StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
129  std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);
130 
131  /** Same as the Step(...) method for multiple batches but uses Nesterov
132  * momentum. */
133  void StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
134  std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);
135 
136  /** Increases the minimization step counter by the test error evaluation
137  * period and uses the current internal value of the test error to
138  * determine if the minimization has converged. */
139  bool HasConverged();
140 
141  /** Increases the minimization step counter by the test error evaluation
142  * period and uses the provided test error value to determine if the
143  * minimization has converged. */
144  bool HasConverged(Scalar_t testError);
145 
146  /** Getters */
147  size_t GetConvergenceCount() const { return fConvergenceCount; }
148  size_t GetConvergenceSteps() const { return fConvergenceSteps; }
149  Scalar_t GetTrainingError() const { return fTrainingError; }
150  Scalar_t GetTestError() const { return fTestError; }
151  size_t GetTestInterval() const { return fTestInterval; }
152 
153  /** Setters */
154  void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
155  void SetTestInterval(size_t interval) { fTestInterval = interval; }
156  void SetLearningRate(Scalar_t rate) { fLearningRate = rate; }
157  void SetBatchSize(Scalar_t rate) { fBatchSize = rate; }
158 };
159 
160 //
161 // Implementation
162 //______________________________________________________________________________
163 template <typename Architecture_t>
164 TDLGradientDescent<Architecture_t>::TDLGradientDescent()
165  : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
166  fMinimumError(std::numeric_limits<Scalar_t>::infinity())
167 {
168  // Nothing to do here.
169 }
170 
171 //______________________________________________________________________________
172 template <typename Architecture_t>
173 TDLGradientDescent<Architecture_t>::TDLGradientDescent(Scalar_t learningRate, size_t convergenceSteps,
174  size_t testInterval)
175  : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
176  fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
177 {
178  // Nothing to do here.
179 }
180 
181 //______________________________________________________________________________
182 template <typename Architecture_t>
183 void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
184  const Matrix_t &weights)
185 {
186  // Make forward and backward pass and update the net afterwards
187  deepNet.Forward(input, true);
188  deepNet.Backward(input, output, weights);
189  deepNet.Update(fLearningRate);
190 }
191 
192 //______________________________________________________________________________
193 template <typename Architecture_t>
194 void TDLGradientDescent<Architecture_t>::StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
195  const Matrix_t &output, const Matrix_t &weights)
196 {
197  // Make forward and backward pass and update the net afterwards
198  deepNet.Forward(input, true);
199  deepNet.Backward(input, output, weights);
200 
201  for (size_t i = 0; i < deepNet.GetDepth(); i++) {
202  auto *layer = deepNet.GetLayerAt(i);
203 
204  layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
205  if (i == 0) {
206  layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
207  }
208  }
209 }
210 
211 //______________________________________________________________________________
212 template <typename Architecture_t>
213 auto TDLGradientDescent<Architecture_t>::StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
214  const Matrix_t &output, const Matrix_t &weights) -> Scalar_t
215 {
216  Scalar_t loss = deepNet.Loss(input, output);
217  deepNet.Backward(input, output, weights);
218  deepNet.Update(fLearningRate);
219 
220  return loss;
221 }
222 
223 //______________________________________________________________________________
224 template <typename Architecture_t>
225 auto TDLGradientDescent<Architecture_t>::StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
226  const Matrix_t &output, const Matrix_t &weights)
227  -> Scalar_t
228 {
229  Scalar_t loss = deepNet.Loss(input, output);
230  fTrainingError = loss;
231  deepNet.Backward(input, output, weights);
232 
233  for (size_t i = 0; i < deepNet.GetDepth(); i++) {
234  auto *layer = deepNet.GetLayerAt(i);
235 
236  layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
237  if (i == 0) {
238  layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
239  }
240  }
241 
242  return loss;
243 }
244 
245 //______________________________________________________________________________
246 template <typename Architecture_t>
247 void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &master, std::vector<DeepNet_t> &nets,
248  std::vector<TTensorBatch<Architecture_t>> &batches)
249 {
250 
251  master.ParallelForward(nets, batches);
252  master.ParallelBackward(nets, batches, fLearningRate);
253 }
254 
255 //______________________________________________________________________________
256 template <typename Architecture_t>
257 void TDLGradientDescent<Architecture_t>::StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
258  std::vector<TTensorBatch<Architecture_t>> &batches,
259  Scalar_t momentum)
260 {
261  master.ParallelForward(nets, batches);
262  master.ParallelBackwardMomentum(nets, batches, fLearningRate, momentum);
263 }
264 
265 //______________________________________________________________________________
266 template <typename Architecture_t>
267 void TDLGradientDescent<Architecture_t>::StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
268  std::vector<TTensorBatch<Architecture_t>> &batches,
269  Scalar_t momentum)
270 {
271  master.ParallelForward(nets, batches);
272  master.ParallelBackwardNestorov(nets, batches, fLearningRate, momentum);
273 }
274 
275 //______________________________________________________________________________
276 template <typename Architecture_t>
277 bool TDLGradientDescent<Architecture_t>::HasConverged()
278 {
279  if (fTestError < fMinimumError * 0.999) {
280  fConvergenceCount = 0;
281  fMinimumError = fTestError;
282  } else {
283  fConvergenceCount++;
284  }
285 
286  return (fConvergenceCount >= fConvergenceSteps);
287 }
288 
289 //______________________________________________________________________________
290 template <typename Architecture_t>
291 bool TDLGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
292 {
293  fTestError = testError;
294  if (fTestError < fMinimumError * 0.999) {
295  fConvergenceCount = 0;
296  fMinimumError = fTestError;
297  } else {
298  fConvergenceCount += fTestInterval;
299  }
300  return (fConvergenceCount >= fConvergenceSteps);
301 }
302 
303 } // namespace DNN
304 } // namespace TMVA
305 
306 #endif