RNNLayer.h
// @(#)root/tmva/tmva/dnn/rnn:$Id$
// Author: Saurav Shekhar 19/07/17

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : BasicRNNLayer                                                         *
 *                                                                                *
 * Description:                                                                   *
 *      NeuralNetwork                                                             *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Saurav Shekhar    <sauravshekhar01@gmail.com> - ETH Zurich, Switzerland   *
 *                                                                                *
 * Copyright (c) 2005-2015:                                                       *
 *      All rights reserved.                                                      *
 *      CERN, Switzerland                                                         *
 *                                                                                *
 * For the licensing terms see $ROOTSYS/LICENSE.                                  *
 * For the list of contributors see $ROOTSYS/README/CREDITS.                      *
 **********************************************************************************/

//#pragma once

//////////////////////////////////////////////////////////////////////
// Basic recurrent layer (TBasicRNNLayer) for the TMVA DNN module.  //
//////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_RNN_LAYER
#define TMVA_DNN_RNN_LAYER

#include <cmath>
#include <iostream>
#include <vector>

#include "TMatrix.h"
#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/GeneralLayer.h" // base class VGeneralLayer

namespace TMVA
{
namespace DNN
{
namespace RNN
{

//______________________________________________________________________________
//
// Basic RNN Layer
//______________________________________________________________________________

/** \class TBasicRNNLayer
    Generic implementation of a vanilla recurrent layer. At each time step the
    hidden state is updated as state = f(W_input . input + W_state . state + bias),
    where f is the layer's activation function (tanh by default), and the
    per-step states form the layer output.
*/
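//
// Minimal usage sketch (an illustration, not taken from this file: it assumes
// the CPU backend TCpu<double> from "TMVA/DNN/Architectures/Cpu.h", and the
// tensor construction is shown schematically since the exact Tensor_t
// constructor depends on the backend):
//
//    using Arch_t  = TMVA::DNN::TCpu<double>;
//    using Layer_t = TMVA::DNN::RNN::TBasicRNNLayer<Arch_t>;
//
//    Layer_t rnn(/*batchSize=*/8, /*stateSize=*/16, /*inputSize=*/4, /*timeSteps=*/10);
//    rnn.Initialize();                  // weight initialisation from VGeneralLayer
//
//    Arch_t::Tensor_t x(8, 10, 4);      // B x T x D input
//    rnn.Forward(x);                    // rnn.GetOutput() is then B x T x H
//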
template<typename Architecture_t>
class TBasicRNNLayer : public VGeneralLayer<Architecture_t>
{

public:

   using Tensor_t = typename Architecture_t::Tensor_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

private:

   size_t fTimeSteps;   ///< Timesteps for RNN
   size_t fStateSize;   ///< Hidden state size of RNN
   bool fRememberState; ///< Remember state in next pass

   DNN::EActivationFunction fF; ///< Activation function of the hidden state

   Matrix_t fState;         ///< Hidden State
   Matrix_t &fWeightsInput; ///< Input weights, fWeights[0]
   Matrix_t &fWeightsState; ///< Prev state weights, fWeights[1]
   Matrix_t &fBiases;       ///< Biases

   Tensor_t fDerivatives;           ///< First derivatives of the activations
   Matrix_t &fWeightInputGradients; ///< Gradients w.r.t. the input weights
   Matrix_t &fWeightStateGradients; ///< Gradients w.r.t. the recurring weights
   Matrix_t &fBiasGradients;        ///< Gradients w.r.t. the bias values

   typename Architecture_t::ActivationDescriptor_t fActivationDesc;

public:

   /** Constructor */
   TBasicRNNLayer(size_t batchSize, size_t stateSize, size_t inputSize,
                  size_t timeSteps, bool rememberState = false,
                  DNN::EActivationFunction f = DNN::EActivationFunction::kTanh,
                  bool training = true, DNN::EInitialization fA = DNN::EInitialization::kZero);

   /** Copy Constructor */
   TBasicRNNLayer(const TBasicRNNLayer &);

   /*! Initialize the weights according to the given initialization
    ** method. */
   //void Initialize(DNN::EInitialization m);

   /*! Initialize the hidden state according to the given initialization
    ** method. */
   void InitState(DNN::EInitialization m = DNN::EInitialization::kZero);

   /*! Computes the next hidden state for each time step of the given
    * input tensor. */
   void Forward(Tensor_t &input, bool isTraining = true);

   /*! Forward for a single cell (time unit) */
   void CellForward(const Matrix_t &input, Matrix_t &dF);

   /*! Backpropagates the error. Must only be called directly after the
    * corresponding call to Forward(...). */
   void Backward(Tensor_t &gradients_backward,
                 const Tensor_t &activations_backward);

   /* Updates weights and biases, given the learning rate */
   void Update(const Scalar_t learningRate);

   /*! Backward for a single time unit, matching the corresponding call
    * to CellForward(...). */
   inline Matrix_t &CellBackward(Matrix_t &state_gradients_backward,
                                 const Matrix_t &precStateActivations,
                                 const Matrix_t &input, Matrix_t &input_gradient, Matrix_t &dF);

   /** Prints the info about the layer */
   void Print() const;

   /*! Writes the information and the weights about the layer in an XML node. */
   virtual void AddWeightsXMLTo(void *parent);

   /*! Read the information and the weights about the layer from XML node. */
   virtual void ReadWeightsFromXML(void *parent);

   /** Getters */
   size_t GetTimeSteps() const { return fTimeSteps; }
   size_t GetStateSize() const { return fStateSize; }
   size_t GetInputSize() const { return this->GetInputWidth(); }
   inline bool IsRememberState() const { return fRememberState; }
   inline DNN::EActivationFunction GetActivationFunction() const { return fF; }
   Matrix_t &GetState() { return fState; }
   const Matrix_t &GetState() const { return fState; }
   Matrix_t &GetWeightsInput() { return fWeightsInput; }
   const Matrix_t &GetWeightsInput() const { return fWeightsInput; }
   Matrix_t &GetWeightsState() { return fWeightsState; }
   const Matrix_t &GetWeightsState() const { return fWeightsState; }
   Tensor_t &GetDerivatives() { return fDerivatives; }
   const Tensor_t &GetDerivatives() const { return fDerivatives; }
   // Matrix_t &GetDerivativesAt(size_t i) { return fDerivatives[i]; }
   // const Matrix_t &GetDerivativesAt(size_t i) const { return fDerivatives[i]; }

   Matrix_t &GetBiasesState() { return fBiases; }
   const Matrix_t &GetBiasesState() const { return fBiases; }
   Matrix_t &GetBiasStateGradients() { return fBiasGradients; }
   const Matrix_t &GetBiasStateGradients() const { return fBiasGradients; }
   Matrix_t &GetWeightInputGradients() { return fWeightInputGradients; }
   const Matrix_t &GetWeightInputGradients() const { return fWeightInputGradients; }
   Matrix_t &GetWeightStateGradients() { return fWeightStateGradients; }
   const Matrix_t &GetWeightStateGradients() const { return fWeightStateGradients; }
};

//______________________________________________________________________________
//
// BasicRNNLayer Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TBasicRNNLayer<Architecture_t>::TBasicRNNLayer(size_t batchSize, size_t stateSize, size_t inputSize, size_t timeSteps,
                                               bool rememberState, DNN::EActivationFunction f, bool /*training*/,
                                               DNN::EInitialization fA)
   // TODO inputDepth and outputDepth changed to batchSize??
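   // How the arguments map onto the VGeneralLayer base constructor (as read off
   // the call below):
   //    input  shape: depth 1, height timeSteps, width inputSize  (B x T x D)
   //    output shape: depth 1, height timeSteps, width stateSize  (B x T x H)
   //    2 weight matrices: W_input (stateSize x inputSize) and
   //                       W_state (stateSize x stateSize)
   //    1 bias vector    : stateSize x 1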
   : VGeneralLayer<Architecture_t>(batchSize, 1, timeSteps, inputSize, 1, timeSteps, stateSize, 2,
                                   {stateSize, stateSize}, {inputSize, stateSize}, 1, {stateSize}, {1}, batchSize,
                                   timeSteps, stateSize, fA),
     fTimeSteps(timeSteps),
     fStateSize(stateSize),
     fRememberState(rememberState),
     fF(f),
     fState(batchSize, stateSize),
     fWeightsInput(this->GetWeightsAt(0)),
     fWeightsState(this->GetWeightsAt(1)),
     fBiases(this->GetBiasesAt(0)),
     fDerivatives(timeSteps, batchSize, stateSize), // create tensor time x bs x S
     fWeightInputGradients(this->GetWeightGradientsAt(0)),
     fWeightStateGradients(this->GetWeightGradientsAt(1)),
     fBiasGradients(this->GetBiasGradientsAt(0))
{
   // Nothing
}

//______________________________________________________________________________
template <typename Architecture_t>
TBasicRNNLayer<Architecture_t>::TBasicRNNLayer(const TBasicRNNLayer &layer)
   : VGeneralLayer<Architecture_t>(layer), fTimeSteps(layer.fTimeSteps), fStateSize(layer.fStateSize),
     fRememberState(layer.fRememberState), fF(layer.GetActivationFunction()),
     fState(layer.GetBatchSize(), layer.GetStateSize()), fWeightsInput(this->GetWeightsAt(0)),
     fWeightsState(this->GetWeightsAt(1)), fBiases(this->GetBiasesAt(0)),
     fDerivatives(layer.GetDerivatives().GetShape()), fWeightInputGradients(this->GetWeightGradientsAt(0)),
     fWeightStateGradients(this->GetWeightGradientsAt(1)), fBiasGradients(this->GetBiasGradientsAt(0))
{
   Architecture_t::Copy(fDerivatives, layer.GetDerivatives());

   // Gradient matrices not copied
   Architecture_t::Copy(fState, layer.GetState());
}

//______________________________________________________________________________
//template<typename Architecture_t>
//auto TBasicRNNLayer<Architecture_t>::Initialize(DNN::EInitialization m)
//-> void
//{
//   DNN::initialize<Architecture_t>(fWeightsInput, m);
//   DNN::initialize<Architecture_t>(fWeightsState, m);
//   DNN::initialize<Architecture_t>(fBiases, DNN::EInitialization::kZero);
//}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicRNNLayer<Architecture_t>::InitState(DNN::EInitialization /*m*/) -> void
{
   DNN::initialize<Architecture_t>(this->GetState(), DNN::EInitialization::kZero);

   Architecture_t::InitializeActivationDescriptor(fActivationDesc, this->GetActivationFunction());
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TBasicRNNLayer<Architecture_t>::Print() const
-> void
{
   std::cout << " RECURRENT Layer: \t ";
   std::cout << " (NInput = " << this->GetInputSize();  // input size
   std::cout << ", NState = " << this->GetStateSize();  // hidden state size
   std::cout << ", NTime = " << this->GetTimeSteps() << " )";  // time size
   std::cout << "\tOutput = ( " << this->GetOutput().GetFirstSize() << " , " << this->GetOutput().GetHSize()
             << " , " << this->GetOutput().GetWSize() << " )\n";
}

template <typename Architecture_t>
auto debugMatrix(const typename Architecture_t::Matrix_t &A, const std::string &name = "matrix")
-> void
{
   std::cout << name << "\n";
   for (size_t i = 0; i < A.GetNrows(); ++i) {
      for (size_t j = 0; j < A.GetNcols(); ++j) {
         std::cout << A(i, j) << " ";
      }
      std::cout << "\n";
   }
   std::cout << "********\n";
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicRNNLayer<Architecture_t>::Forward(Tensor_t &input, bool /*isTraining*/) // B x T x D
-> void
{
   // D : input size
   // H : state size
   // T : time size
   // B : batch size

   Tensor_t arrInput(fTimeSteps, this->GetBatchSize(), this->GetInputWidth());
   //for (size_t t = 0; t < fTimeSteps; ++t) arrInput.emplace_back(this->GetBatchSize(), this->GetInputWidth()); // T x B x D
   Architecture_t::Rearrange(arrInput, input);
   Tensor_t arrOutput(fTimeSteps, this->GetBatchSize(), fStateSize);
   //for (size_t t = 0; t < fTimeSteps; ++t) arrOutput.emplace_back(this->GetBatchSize(), fStateSize); // T x B x H

   if (!this->fRememberState) InitState(DNN::EInitialization::kZero);
   for (size_t t = 0; t < fTimeSteps; ++t) {
      Matrix_t arrInput_m = arrInput.At(t).GetMatrix();
      Matrix_t df_m = fDerivatives.At(t).GetMatrix();
      CellForward(arrInput_m, df_m);
      Matrix_t arrOutput_m = arrOutput.At(t).GetMatrix();
      Architecture_t::Copy(arrOutput_m, fState);
   }
   Architecture_t::Rearrange(this->GetOutput(), arrOutput); // B x T x H
}
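
// In equations: with x_t the t-th input slice (B x D) and h_t the hidden state
// (B x H), the loop above computes, for t = 1..T,
//
//    h_t = f(x_t . W_input^T + h_{t-1} . W_state^T + bias)
//
// (MultiplyTranspose multiplies by the transposed weight matrix), storing h_t
// as the t-th output slice; h_0 is either zero-initialised or, with
// fRememberState, carried over from the previous batch.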

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicRNNLayer<Architecture_t>::CellForward(const Matrix_t &input, Matrix_t &dF)
-> void
{
   // State = act(W_input . input + W_state . state + bias)
   const DNN::EActivationFunction fAF = this->GetActivationFunction();
   Matrix_t tmpState(fState.GetNrows(), fState.GetNcols());
   Architecture_t::MultiplyTranspose(tmpState, fState, fWeightsState);
   Architecture_t::MultiplyTranspose(fState, input, fWeightsInput);
   Architecture_t::ScaleAdd(fState, tmpState);
   Architecture_t::AddRowWise(fState, fBiases);
   Tensor_t inputActivFunc(dF);
   Tensor_t tState(fState);

   // DNN::evaluateDerivative<Architecture_t>(dFt, fAF, fState);
   // DNN::evaluate<Architecture_t>(tState, fAF);

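   // Note: inputActivFunc and tState are views sharing memory with dF and
   // fState, so the Copy below stashes the pre-activation values in dF (to be
   // consumed by ActivationFunctionBackward during Backward), and the
   // activation is then applied to fState in place.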
   Architecture_t::Copy(inputActivFunc, tState);
   Architecture_t::ActivationFunctionForward(tState, fAF, fActivationDesc);
}

//____________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicRNNLayer<Architecture_t>::Backward(Tensor_t &gradients_backward, // B x T x D
                                                     const Tensor_t &activations_backward) -> void // B x T x D
   // std::vector<Matrix_t> & /*inp1*/, std::vector<Matrix_t> & /*inp2*/) -> void
{
   // activations_backward is the input
   // gradients_backward is the activationGradients of the layer before this one, i.e. the input layer
   // currently gradients_backward is computed for the input (x) and not for the state
   // TODO use this to change initial state??


   bool dummy = false;
   if (gradients_backward.GetSize() == 0) {
      dummy = true;
   }
   Tensor_t arr_gradients_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());
   //for (size_t t = 0; t < fTimeSteps; ++t) arr_gradients_backward.emplace_back(this->GetBatchSize(), this->GetInputSize()); // T x B x D

   if (!dummy) {
      // TODO gradients_backward will be written back on the matrix
      //Architecture_t::Rearrange(arr_gradients_backward, gradients_backward);
   }
   Tensor_t arr_activations_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());
   //for (size_t t = 0; t < fTimeSteps; ++t) arr_activations_backward.emplace_back(this->GetBatchSize(), this->GetInputSize()); // T x B x D
   Architecture_t::Rearrange(arr_activations_backward, activations_backward);

   Matrix_t state_gradients_backward(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(state_gradients_backward, DNN::EInitialization::kZero);

   Matrix_t initState(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(initState, DNN::EInitialization::kZero);

   Tensor_t arr_output(fTimeSteps, this->GetBatchSize(), fStateSize);
   //for (size_t t = 0; t < fTimeSteps; ++t) arr_output.emplace_back(this->GetBatchSize(), fStateSize);
   Architecture_t::Rearrange(arr_output, this->GetOutput());

   Tensor_t arr_actgradients(fTimeSteps, this->GetBatchSize(), fStateSize);
   //for (size_t t = 0; t < fTimeSteps; ++t) arr_actgradients.emplace_back(this->GetBatchSize(), fStateSize);
   Architecture_t::Rearrange(arr_actgradients, this->GetActivationGradients());

   // reinitialize weight and bias gradients to 0
   fWeightInputGradients.Zero();
   fWeightStateGradients.Zero();
   fBiasGradients.Zero();

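   // Backpropagation through time: walk the time steps in reverse. At each
   // step t the state gradient first accumulates the output gradient dL/dy_t,
   // is multiplied by the activation derivative f'(net_t) via
   // ActivationFunctionBackward, and is then pushed through the cell by
   // CellBackward, which also sums the weight and bias gradients over all steps.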
   for (size_t t = fTimeSteps; t > 0; t--) {
      //const Matrix_t &currStateActivations = arr_output[t - 1];
      Matrix_t actgrad_m = arr_actgradients.At(t - 1).GetMatrix();
      Architecture_t::ScaleAdd(state_gradients_backward, actgrad_m);

      Matrix_t actbw_m = arr_activations_backward.At(t - 1).GetMatrix();
      Matrix_t gradbw_m = arr_gradients_backward.At(t - 1).GetMatrix();

      // Architecture_t::PrintTensor(arr_actgradients.At(t - 1), "act grad");
      // Architecture_t::PrintTensor(Tensor_t(state_gradients_backward), "state grad before");

      // compute derivatives of activations
      Tensor_t df = fDerivatives.At(t - 1);
      Tensor_t dy = Tensor_t(state_gradients_backward);
      //Tensor_t dy = arr_actgradients.At(t - 1);
      Tensor_t y = arr_output.At(t - 1);
      Architecture_t::ActivationFunctionBackward(df, y,
                                                 dy, df, // do it in place (should work)
                                                 this->GetActivationFunction(), fActivationDesc);

      Matrix_t df_m = df.GetMatrix();

      // Architecture_t::PrintTensor(df, "dy before");
      if (t > 1) {
         Matrix_t precStateActivations = arr_output.At(t - 2).GetMatrix();
         CellBackward(state_gradients_backward, precStateActivations, actbw_m, gradbw_m, df_m);

         // std::cout << "at time " << t << std::endl;
         // Architecture_t::PrintTensor(Tensor_t(state_gradients_backward), "state grad after");
         // Architecture_t::PrintTensor(arr_gradients_backward.At(t - 1), "dx");
         // Architecture_t::PrintTensor(arr_activations_backward.At(t - 1), "x");
         // Architecture_t::PrintTensor(df, "dy after");
      } else {
         const Matrix_t &precStateActivations = initState;
         CellBackward(state_gradients_backward, precStateActivations, actbw_m, gradbw_m, df_m);

         // std::cout << "at time " << t << std::endl;
         // Architecture_t::PrintTensor(Tensor_t(state_gradients_backward), "state grad after");
         // Architecture_t::PrintTensor(arr_gradients_backward.At(t - 1), "dx");
         // Architecture_t::PrintTensor(arr_activations_backward.At(t - 1), "x");
         // Architecture_t::PrintTensor(df, "dy");
      }
   }
   if (!dummy) {
      Architecture_t::Rearrange(gradients_backward, arr_gradients_backward);
   }
   //Architecture_t::Rearrange(arr_activations_backward, activations_backward);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicRNNLayer<Architecture_t>::CellBackward(Matrix_t &state_gradients_backward,
                                                         const Matrix_t &precStateActivations,
                                                         const Matrix_t &input, Matrix_t &input_gradient, Matrix_t &dF)
-> Matrix_t &
{
   return Architecture_t::RecurrentLayerBackward(state_gradients_backward, fWeightInputGradients, fWeightStateGradients,
                                                 fBiasGradients, dF, precStateActivations, fWeightsInput,
                                                 fWeightsState, input, input_gradient);
}
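
// Schematically (using the reference backend as a guide; dF holds dL/dnet_t
// after ActivationFunctionBackward), RecurrentLayerBackward computes
//
//    input_gradient           = dF . W_input                (dL/dx_t)
//    state_gradients_backward = dF . W_state                (dL/dh_{t-1})
//    fWeightInputGradients   += dF^T . input
//    fWeightStateGradients   += dF^T . precStateActivations
//    fBiasGradients          += column-wise sums of dF
//
// and returns the updated state_gradients_backward.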

//______________________________________________________________________________
template <typename Architecture_t>
void TBasicRNNLayer<Architecture_t>::AddWeightsXMLTo(void *parent)
{
   auto layerxml = gTools().xmlengine().NewChild(parent, 0, "RNNLayer");

   // write all other info like stateSize, inputSize, timeSteps, rememberState
   gTools().xmlengine().NewAttr(layerxml, 0, "StateSize", gTools().StringFromInt(this->GetStateSize()));
   gTools().xmlengine().NewAttr(layerxml, 0, "InputSize", gTools().StringFromInt(this->GetInputSize()));
   gTools().xmlengine().NewAttr(layerxml, 0, "TimeSteps", gTools().StringFromInt(this->GetTimeSteps()));
   gTools().xmlengine().NewAttr(layerxml, 0, "RememberState", gTools().StringFromInt(this->IsRememberState()));

   // write weights and bias matrices
   this->WriteMatrixToXML(layerxml, "InputWeights", this->GetWeightsAt(0));
   this->WriteMatrixToXML(layerxml, "StateWeights", this->GetWeightsAt(1));
   this->WriteMatrixToXML(layerxml, "Biases", this->GetBiasesAt(0));
}
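
// The node written above has, schematically, the following shape (the exact
// matrix payload is whatever WriteMatrixToXML emits for the backend in use):
//
//    <RNNLayer StateSize="H" InputSize="D" TimeSteps="T" RememberState="0|1">
//       <InputWeights .../> <StateWeights .../> <Biases .../>
//    </RNNLayer>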

//______________________________________________________________________________
template <typename Architecture_t>
void TBasicRNNLayer<Architecture_t>::ReadWeightsFromXML(void *parent)
{
   // Read weights and biases
   this->ReadMatrixXML(parent, "InputWeights", this->GetWeightsAt(0));
   this->ReadMatrixXML(parent, "StateWeights", this->GetWeightsAt(1));
   this->ReadMatrixXML(parent, "Biases", this->GetBiasesAt(0));
}

} // namespace RNN
} // namespace DNN
} // namespace TMVA

#endif