// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 20/06/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////
// Contains the TLayer and TSharedLayer classes that represent     //
// layers in neural networks.                                      //
//////////////////////////////////////////////////////////////////////
#ifndef TMVA_DNN_LAYER
#define TMVA_DNN_LAYER

#include <iostream>

#include "TMatrix.h"
#include "Functions.h"

namespace TMVA
{
namespace DNN
{

//______________________________________________________________________________
//
// The Layer Class
//______________________________________________________________________________
/** \class TLayer

    Generic layer class.

    This generic layer class represents a layer of a neural network with
    a given width n and activation function f. For an input \f$\mathbf{x}\f$,
    the layer first computes the pre-activations
    \f$\mathbf{u} = \mathbf{W}\mathbf{x} + \boldsymbol{\theta}\f$ from its
    weight matrix \f$\mathbf{W}\f$ and bias vector \f$\boldsymbol{\theta}\f$,
    and then applies the activation function elementwise, so that the layer
    output is \f$f(\mathbf{u})\f$.

    In addition to the weight and bias matrices, each layer allocates memory
    for its activations and the corresponding first partial derivatives of
    the activation function, as well as for the gradients of the weights and
    biases.

    The layer provides member functions for the forward propagation of
    activations through the given layer.
*/
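
/* A minimal usage sketch, assuming the reference backend
 * TMVA::DNN::TReference<double> from "TMVA/DNN/Architectures/Reference.h";
 * the sizes and the input matrix below are illustrative only.
 *
 * \code
 * using Arch_t = TMVA::DNN::TReference<double>;
 * TMVA::DNN::TLayer<Arch_t> layer(32, 10, 5,  // batch size, input width, width
 *                                 EActivationFunction::kTanh,
 *                                 1.0);        // dropout off (keep prob. = 1)
 * layer.Initialize(EInitialization::kGauss);
 *
 * Arch_t::Matrix_t input(32, 10);              // one event per row
 * layer.Forward(input);                        // result in layer.GetOutput()
 * \endcode
 */
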
template<typename Architecture_t>
class TLayer
{

public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Tensor_t = typename Architecture_t::Tensor_t;

private:

   size_t fBatchSize;  ///< Batch size used for training and evaluation.
   size_t fInputWidth; ///< Number of neurons of the previous layer.
   size_t fWidth;      ///< Number of neurons of this layer.

   Scalar_t fDropoutProbability; ///< Probability that an input is active.

   Matrix_t fWeights;             ///< The weights of this layer.
   Matrix_t fBiases;              ///< The bias values of this layer.
   Matrix_t fOutput;              ///< Activations of this layer.
   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.

   EActivationFunction fF; ///< Activation function of the layer.

public:

   TLayer(size_t batchSize,
          size_t inputWidth,
          size_t width,
          EActivationFunction f,
          Scalar_t dropoutProbability);
   TLayer(const TLayer &);

   /*! Initialize the weights according to the given initialization
    *  method. */
   void Initialize(EInitialization m);
   /*! Compute the activations of the layer for the given input. The input
    *  must be in matrix form, with the rows corresponding to the events
    *  in the batch. Computes the activations as well as the first partial
    *  derivatives of the activation function at the pre-activations. */
   void inline Forward(Matrix_t & input, bool applyDropout = false);
   /*! Compute the weight, bias and activation gradients. Uses the
    *  first partial derivatives of the activation function computed during
    *  forward propagation and modifies them. Must only be called directly
    *  after the corresponding call to Forward(...). */
   void inline Backward(Matrix_t & gradients_backward,
                        const Matrix_t & activations_backward,
                        ERegularization r,
                        Scalar_t weightDecay);
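
   /* A sketch of the quantities computed, using the standard backpropagation
    * identities (the actual kernels are supplied by Architecture_t): with
    * X the input activations, U = X W^T + theta the pre-activations, f' the
    * stored derivatives and dY the incoming activation gradients,
    *
    *    dU     = dY (elementwise *) f'(U)   -- overwrites the derivatives
    *    dW     = dU^T X                     -- weight gradients
    *    dtheta = column sums of dU          -- bias gradients
    *    dX     = dU W                       -- gradients passed backward
    */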

   void Print() const;

   size_t GetBatchSize() const {return fBatchSize;}
   size_t GetInputWidth() const {return fInputWidth;}
   size_t GetWidth() const {return fWidth;}
   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}

   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}

   EActivationFunction GetActivationFunction() const {return fF;}

   Matrix_t & GetOutput() {return fOutput;}
   const Matrix_t & GetOutput() const {return fOutput;}
   Matrix_t & GetWeights() {return fWeights;}
   const Matrix_t & GetWeights() const {return fWeights;}
   Matrix_t & GetBiases() {return fBiases;}
   const Matrix_t & GetBiases() const {return fBiases;}
   Matrix_t & GetActivationGradients() {return fActivationGradients;}
   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
   Matrix_t & GetBiasGradients() {return fBiasGradients;}
   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
   Matrix_t & GetWeightGradients() {return fWeightGradients;}
   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}

};

//______________________________________________________________________________
//
// The Shared Layer Class
//______________________________________________________________________________

/** \class TSharedLayer

    Layer class with shared weight and bias matrices.

    Like the TLayer class, except that the weight and bias matrices are
    shared between different instances of the net, which can be used to
    implement multithreaded, 'Hogwild!'-style training.
*/
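
/* A minimal sketch of such sharing, again assuming the reference backend
 * TMVA::DNN::TReference<double>; the sizes are illustrative. A master TLayer
 * owns the weights, while each TSharedLayer holds references to them and
 * allocates only its own output and gradient buffers, so concurrent workers
 * read and update the same parameters.
 *
 * \code
 * using Arch_t = TMVA::DNN::TReference<double>;
 * TMVA::DNN::TLayer<Arch_t> master(32, 10, 5,
 *                                  EActivationFunction::kTanh, 1.0);
 * master.Initialize(EInitialization::kGauss);
 *
 * // One view per worker thread, possibly with its own batch size.
 * TMVA::DNN::TSharedLayer<Arch_t> worker1(16, master);
 * TMVA::DNN::TSharedLayer<Arch_t> worker2(16, master);
 * \endcode
 */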

template<typename Architecture_t>
class TSharedLayer
{

public:

   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Tensor_t = typename Architecture_t::Tensor_t;

private:

   size_t fBatchSize;  ///< Batch size used for training and evaluation.
   size_t fInputWidth; ///< Number of neurons of the previous layer.
   size_t fWidth;      ///< Number of neurons of this layer.

   Scalar_t fDropoutProbability; ///< Probability that an input is active.

   Matrix_t & fWeights;           ///< Reference to the weight matrix of this layer.
   Matrix_t & fBiases;            ///< Reference to the bias vectors of this layer.
   Matrix_t fOutput;              ///< Activations of this layer.
   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.

   EActivationFunction fF; ///< Activation function of the layer.

public:

   TSharedLayer(size_t BatchSize,
                TLayer<Architecture_t> & layer);
   TSharedLayer(const TSharedLayer & layer);

   /*! Compute the activations of the layer for the given input. The input
    *  must be in matrix form, with the rows corresponding to the events
    *  in the batch. Computes the activations as well as the first partial
    *  derivatives of the activation function at the pre-activations. */
   void inline Forward(Matrix_t & input, bool applyDropout = false);
   /*! Compute the weight, bias and activation gradients. Uses the
    *  first partial derivatives of the activation function computed during
    *  forward propagation and modifies them. Must only be called directly
    *  after the corresponding call to Forward(...). */
   void inline Backward(Matrix_t & gradients_backward,
                        const Matrix_t & activations_backward,
                        ERegularization r,
                        Scalar_t weightDecay);

   void Print() const;

   size_t GetBatchSize() const {return fBatchSize;}
   size_t GetInputWidth() const {return fInputWidth;}
   size_t GetWidth() const {return fWidth;}
   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}

   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}

   EActivationFunction GetActivationFunction() const {return fF;}

   Matrix_t & GetOutput() {return fOutput;}
   const Matrix_t & GetOutput() const {return fOutput;}
   Matrix_t & GetWeights() const {return fWeights;}
   Matrix_t & GetBiases() {return fBiases;}
   const Matrix_t & GetBiases() const {return fBiases;}
   Matrix_t & GetActivationGradients() {return fActivationGradients;}
   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
   Matrix_t & GetBiasGradients() {return fBiasGradients;}
   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
   Matrix_t & GetWeightGradients() {return fWeightGradients;}
   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}

};

//______________________________________________________________________________
//
// The Layer Class - Implementation
//______________________________________________________________________________

template<typename Architecture_t>
TLayer<Architecture_t>::TLayer(size_t batchSize,
                               size_t inputWidth,
                               size_t width,
                               EActivationFunction f,
                               Scalar_t dropoutProbability)
   : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width),
     fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth),
     fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width),
     fWeightGradients(width, fInputWidth), fBiasGradients(width, 1),
     fActivationGradients(fBatchSize, width), fF(f)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
TLayer<Architecture_t>::TLayer(const TLayer &layer)
   : fBatchSize(layer.fBatchSize), fInputWidth(layer.fInputWidth),
     fWidth(layer.fWidth), fDropoutProbability(layer.fDropoutProbability),
     fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1),
     fOutput(layer.fBatchSize, layer.fWidth),
     fDerivatives(layer.fBatchSize, layer.fWidth),
     fWeightGradients(layer.fWidth, layer.fInputWidth),
     fBiasGradients(layer.fWidth, 1),
     fActivationGradients(layer.fBatchSize, layer.fWidth),
     fF(layer.fF)
{
   Architecture_t::Copy(fWeights, layer.GetWeights());
   Architecture_t::Copy(fBiases, layer.GetBiases());
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TLayer<Architecture_t>::Initialize(EInitialization m)
-> void
{
   initialize<Architecture_t>(fWeights, m);
   // Biases are always initialized to zero, independent of the method m.
   initialize<Architecture_t>(fBiases, EInitialization::kZero);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TLayer<Architecture_t>::Forward(Matrix_t & input,
                                            bool applyDropout)
-> void
{
   if (applyDropout && (fDropoutProbability != 1.0)) {
      Architecture_t::DropoutForward(input, fDropoutProbability);
   }
   // Pre-activations: fOutput = input * W^T, then add the bias to each row.
   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
   Architecture_t::AddRowWise(fOutput, fBiases);
   Tensor_t tOutput(fOutput);
   Tensor_t tDerivatives(fDerivatives);
   // Evaluate f' at the pre-activations first, since evaluate then replaces
   // the pre-activations by the activations in place.
   evaluateDerivative<Architecture_t>(tDerivatives, fF, tOutput);
   evaluate<Architecture_t>(tOutput, fF);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
                                      const Matrix_t & activations_backward,
                                      ERegularization r,
                                      Scalar_t weightDecay)
-> void
{
   Tensor_t tGradBw(gradients_backward);
   Tensor_t tActBw(activations_backward);
   Tensor_t tActGrad(fActivationGradients);
   Tensor_t tDeriv(fDerivatives);

   // dU = f'(U) (elementwise *) dY, stored in place in fDerivatives.
   Architecture_t::Hadamard(tDeriv, tActGrad);
   Architecture_t::Backward(tGradBw,
                            fWeightGradients,
                            fBiasGradients,
                            tDeriv,
                            tActGrad,
                            fWeights,
                            tActBw);
   addRegularizationGradients<Architecture_t>(fWeightGradients,
                                              fWeights,
                                              weightDecay, r);
}

//______________________________________________________________________________
template<typename Architecture_t>
void TLayer<Architecture_t>::Print() const
{
   std::cout << "Width = " << fWeights.GetNrows();
   std::cout << ", Activation Function = ";
   std::cout << static_cast<int>(fF) << std::endl;
}

//______________________________________________________________________________
//
// The Shared Layer Class - Implementation
//______________________________________________________________________________

//______________________________________________________________________________
template<typename Architecture_t>
TSharedLayer<Architecture_t>::TSharedLayer(size_t BatchSize,
                                           TLayer<Architecture_t> &layer)
   : fBatchSize(BatchSize),
     fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
     fDropoutProbability(layer.GetDropoutProbability()),
     fWeights(layer.GetWeights()), fBiases(layer.GetBiases()),
     fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth),
     fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
     fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer)
   : fBatchSize(layer.fBatchSize),
     fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
     fDropoutProbability(layer.fDropoutProbability), fWeights(layer.fWeights),
     fBiases(layer.fBiases), fOutput(layer.fBatchSize, fWidth),
     fDerivatives(layer.fBatchSize, fWidth), fWeightGradients(fWidth, fInputWidth),
     fBiasGradients(fWidth, 1), fActivationGradients(layer.fBatchSize, fWidth),
     fF(layer.fF)
{
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input,
                                                  bool applyDropout)
-> void
{
   if (applyDropout && (fDropoutProbability != 1.0)) {
      Architecture_t::DropoutForward(input, fDropoutProbability);
   }
   // Pre-activations: fOutput = input * W^T, then add the bias to each row.
   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
   Architecture_t::AddRowWise(fOutput, fBiases);
   Tensor_t tOutput(fOutput);
   Tensor_t tDerivatives(fDerivatives);
   // As in TLayer::Forward, f' is evaluated before f overwrites the
   // pre-activations in place.
   evaluateDerivative<Architecture_t>(tDerivatives, fF, tOutput);
   evaluate<Architecture_t>(tOutput, fF);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
                                                   const Matrix_t & activations_backward,
                                                   ERegularization r,
                                                   Scalar_t weightDecay)
-> void
{
   Architecture_t::Backward(gradients_backward,
                            fWeightGradients,
                            fBiasGradients,
                            fDerivatives,
                            fActivationGradients,
                            fWeights,
                            activations_backward);
   addRegularizationGradients<Architecture_t>(fWeightGradients,
                                              fWeights,
                                              weightDecay, r);
}

//______________________________________________________________________________
template<typename Architecture_t>
void TSharedLayer<Architecture_t>::Print() const
{
   std::cout << "Width = " << fWeights.GetNrows();
   std::cout << ", Activation Function = ";
   std::cout << static_cast<int>(fF) << std::endl;
}

} // namespace DNN
} // namespace TMVA

#endif