// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 05/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////
// Definition of the TCpu architecture, which provides a        //
// multi-threaded CPU implementation of the low-level interface //
// for neural networks, using BLAS and ROOT's TThreadExecutor.  //
//////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_ARCHITECTURES_CPU
#define TMVA_DNN_ARCHITECTURES_CPU

#include "TMVA/DNN/Functions.h"
//#include "TMVA/DNN/CNN/Descriptors.h"
#include "TMVA/DNN/BatchNormLayer.h"
#include "TMVA/DNN/CNN/ConvLayer.h"
#include "TMVA/DNN/CNN/MaxPoolLayer.h"

// Headers declaring the CPU buffer/matrix/tensor types used below; the
// extracted listing dropped several include lines, so these are restored
// from the types referenced in the class body.
#include "Cpu/CpuBuffer.h"
#include "Cpu/CpuMatrix.h"
#include "Cpu/CpuTensor.h"

#include <vector>

class TRandom;
namespace TMVA
{
namespace DNN
{
//class EActivationFunction;
struct DummyDescriptor {};
struct DummyFilterDescriptor {};
struct DummyConvolutionDescriptor {};
struct DummyDropoutDescriptor {};
struct DummyPoolingDescriptor {};
struct DummyConvolutionFwdAlgo {};
struct DummyConvolutionBwdDataAlgo {};
struct DummyConvolutionBwdFilterAlgo {};
struct DummyDataType {};

struct DummyEmptyDescriptor {};
/** The TCpu architecture class.
 *
 * Low-level interface class for multi-threaded CPU architectures. Contains as
 * public types the declaration of the scalar, matrix and data loader types
 * for this architecture as well as the remaining functions in the low-level
 * interface in the form of static members.
 */
template<typename AReal = Float_t>
class TCpu
{
private:
   static TRandom * fgRandomGen;
public:

   using Scalar_t       = AReal;
   using Tensor_t       = TCpuTensor<AReal>;
   using Matrix_t       = TCpuMatrix<AReal>;
   using HostBuffer_t   = TCpuBuffer<AReal>;
   using DeviceBuffer_t = TCpuBuffer<AReal>;

   using ActivationDescriptor_t  = DummyDescriptor;
   using ConvolutionDescriptor_t = DummyDescriptor;
   using FilterDescriptor_t      = DummyDescriptor;
   using DropoutDescriptor_t     = DummyDescriptor;
   //using OpTensorDescriptor_t  = DummyOpTensorDescriptor;
   using PoolingDescriptor_t     = DummyDescriptor;
   using TensorDescriptor_t      = DummyDescriptor;
   //using ReductionDescriptor_t = DummyReduceTensorDescriptor;
   using AlgorithmForward_t      = DummyConvolutionFwdAlgo;
   using AlgorithmBackward_t     = DummyConvolutionBwdDataAlgo;
   using AlgorithmHelper_t       = DummyConvolutionBwdFilterAlgo;
   using AlgorithmDataType_t     = DummyDataType;
   using ReduceTensorDescriptor_t = DummyDataType;

   using EmptyDescriptor_t = DummyDescriptor; // Used if a descriptor is not needed in a class

   using BNormLayer_t         = TBatchNormLayer<TCpu<AReal>>;
   using BNormDescriptors_t   = TDNNGenDescriptors<BNormLayer_t>;
   //using BNormWorkspace_t   = CNN::TCNNWorkspace<BNormLayer_t>;
   using ConvLayer_t          = CNN::TConvLayer<TCpu<AReal>>;
   using ConvDescriptors_t    = CNN::TCNNDescriptors<ConvLayer_t>;
   using ConvWorkspace_t      = CNN::TCNNWorkspace<ConvLayer_t>;
   using PoolingLayer_t       = CNN::TMaxPoolLayer<TCpu<AReal>>;
   using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
   using PoolingWorkspace_t   = CNN::TCNNWorkspace<PoolingLayer_t>;

   static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::ColumnMajor; }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t( {c, h * w, n}, GetTensorLayout());
   }
   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t( buffer, {c, h * w, n}, GetTensorLayout());
   }
   // Create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes.
   // This function is used by the optimizers to store intermediate weight representations.
   static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
      if (!newWeights.empty()) newWeights.clear();
      size_t n = weights.size();
      for (size_t i = 0; i < n; ++i)
         newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
   }
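
   // Illustrative sketch (not part of the original header): creating a
   // column-major tensor for a batch of n = 32 images with c = 3 channels of
   // h = w = 28 pixels; the stored shape follows the {c, h*w, n} convention
   // used by CreateTensor above.
   //
   //   using Arch_t = TMVA::DNN::TCpu<Float_t>;
   //   auto X = Arch_t::CreateTensor(32, 3, 28, 28); // shape {3, 784, 32}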
   //____________________________________________________________________________
   //
   // Architecture Initialization
   //____________________________________________________________________________

   /** Initialize CNN data/operator descriptors. Not used at the moment.*/

   static void InitializeBNormDescriptors(TDescriptors * & /*descriptors*/,
                                          BNormLayer_t * /*L = nullptr*/) {}

   static void InitializeConvDescriptors(TDescriptors * & /*descriptors*/,
                                         ConvLayer_t * /*L = nullptr*/) {}
   static void InitializePoolDescriptors(TDescriptors * & /*descriptors*/,
                                         PoolingLayer_t * /*L = nullptr*/) {}

   static void InitializeActivationDescriptor(ActivationDescriptor_t & /*descriptors*/, EActivationFunction /*activFunc*/, double /*coef*/ = 0.0) {}

   /** Release CNN data/operator descriptors. Not used at the moment.*/
   static void ReleaseConvDescriptors(TDescriptors * & /*descriptors*/) {}
   static void ReleasePoolDescriptors(TDescriptors * & /*descriptors*/) {}
   static void ReleaseBNormDescriptors(TDescriptors * & /*descriptors*/) {}

   static void InitializeConvWorkspace(TWorkspace * & /*workspace*/,
                                       TDescriptors * & /*descriptors*/,
                                       const DNN::CNN::TConvParams & /*params*/,
                                       ConvLayer_t * /*L = nullptr*/) {}
   static void InitializePoolDropoutWorkspace(TWorkspace * & /*workspace*/,
                                              TDescriptors * & /*descriptors*/,
                                              const DNN::CNN::TConvParams & /*params*/,
                                              PoolingLayer_t * /*L = nullptr*/) {}

   static void FreeConvWorkspace(TWorkspace * & /*workspace*/, ConvLayer_t * /*L = nullptr*/) {} ///< Only used for certain cuDNN on-device memory
   static void FreePoolDropoutWorkspace(TWorkspace * & /*workspace*/, PoolingLayer_t * /*L = nullptr*/) {}

   static void ReleaseDescriptor(ActivationDescriptor_t & /* activationDescr */) {}
   //____________________________________________________________________________
   //
   // Propagation
   //____________________________________________________________________________

   /** @name Forward Propagation
    * Low-level functions required for the forward propagation of activations
    * through the network.
    */
   ///@{
   /** Matrix-multiply \p input with the transpose of \p weights and
    * write the results into \p output. */
   static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights);

   static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights) {
      Matrix_t output_matrix = output.GetMatrix();
      MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
      //Tensor_t::MatrixToTensor(output_matrix, output); // this may not be needed
   }

   /** Add the vector \p biases row-wise to the matrix \p output. */
   static void AddRowWise(Matrix_t &output, const Matrix_t &biases);

   static void AddRowWise(Tensor_t &output, const Matrix_t &biases) {
      Matrix_t output_matrix = output.GetMatrix();
      AddRowWise(output_matrix, biases);
      //Tensor_t::MatrixToTensor(output_matrix, output); // this may not be needed
   }
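
   // Illustrative sketch (not part of the original header): the two calls
   // above together implement a dense-layer forward pass; the names are
   // hypothetical.
   //
   //   Matrix_t out(batchSize, nOutputs);
   //   TCpu<Float_t>::MultiplyTranspose(out, input, weights); // out = input * W^T
   //   TCpu<Float_t>::AddRowWise(out, biases);                // out += bias (per row)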

   /** @name Backward Propagation (Dense Layers)
    * Low-level functions required for the backward propagation of activations
    * through the network.
    */
   ///@{
   /** Perform the complete backward propagation step. If the provided
    * \p activationGradientsBackward matrix is not empty, compute the
    * gradients of the objective function with respect to the activations
    * of the previous layer (backward direction).
    * Also compute the weight and the bias gradients. Modifies the values
    * in \p df and thus produces a valid result only the first time it is
    * applied after the corresponding forward propagation has been
    * performed. */
   static void Backward(Tensor_t & activationGradientsBackward,
                        Matrix_t & weightGradients,
                        Matrix_t & biasGradients,
                        const Tensor_t & df,
                        const Tensor_t & activationGradients,
                        const Matrix_t & weights,
                        const Tensor_t & activationBackward);

   /** Adds the elements in matrix \p B, scaled by \p beta, to the elements
    * in matrix \p A. This is required for the weight update in the gradient
    * descent step. */
   static void ScaleAdd(Matrix_t & A,
                        const Matrix_t & B,
                        Scalar_t beta = 1.0);

   static void Copy(Matrix_t & B,
                    const Matrix_t & A);

   // copy from another type of matrix
   template<typename AMatrix_t>
   static void CopyDiffArch(Matrix_t & B, const AMatrix_t & A);


   /** Above functions extended to tensors. */
   static void ScaleAdd(Tensor_t & A,
                        const Tensor_t & B,
                        Scalar_t beta = 1.0);

   static void Copy(Tensor_t & A,
                    const Tensor_t & B);

   // copy from another tensor
   template<typename ATensor_t>
   static void CopyDiffArch(Tensor_t & A,
                            const ATensor_t & B);

   // copy from a vector of matrices of a different type
   template<typename AMatrix_t>
   static void CopyDiffArch(std::vector<Matrix_t> & A,
                            const std::vector<AMatrix_t> & B);

   ///@}
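
   // Illustrative sketch (not part of the original header): a plain
   // gradient-descent weight update written with ScaleAdd; learningRate is a
   // hypothetical parameter.
   //
   //   // W <- W - learningRate * dW
   //   TCpu<Float_t>::ScaleAdd(W, dW, -learningRate);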

   //____________________________________________________________________________
   //
   // Activation Functions
   //____________________________________________________________________________

   /** @name Activation Functions
    * For each activation function, the low-level interface contains two routines.
    * One applies the activation function to a matrix and one evaluates
    * the derivatives of the activation function at the elements of a given matrix
    * and writes the results into the result matrix.
    */
   ///@{
   /* impl using Matrix */
   /*inline void evaluate(Matrix_t &A, EActivationFunction f)
   {
      Tensor_t tA(A);
      evaluate<TCpu<AReal>>(tA, f);
   }*/

   static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr,
                                         const double coef = 0.0, const Scalar_t alpha = 1,
                                         const Scalar_t beta = 0);

   /** Computes the gradient of the activation function */
   static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
                                          const Tensor_t & dY, const Tensor_t & X,
                                          EActivationFunction activFunct,
                                          const ActivationDescriptor_t activationDescr,
                                          const Scalar_t alpha = 1,
                                          const Scalar_t beta = 0);

   static void IdentityDerivative(Tensor_t & B,
                                  const Tensor_t & A);

   static void Relu(Tensor_t & B);
   static void ReluDerivative(Tensor_t & B,
                              const Tensor_t & A);

   static void Sigmoid(Tensor_t & B);
   static void SigmoidDerivative(Tensor_t & B,
                                 const Tensor_t & A);

   static void Tanh(Tensor_t & B);
   static void TanhDerivative(Tensor_t & B,
                              const Tensor_t & A);

   static void SymmetricRelu(Tensor_t & B);
   static void SymmetricReluDerivative(Tensor_t & B,
                                       const Tensor_t & A);

   static void SoftSign(Tensor_t & B);
   static void SoftSignDerivative(Tensor_t & B,
                                  const Tensor_t & A);

   static void Gauss(Tensor_t & B);
   static void GaussDerivative(Tensor_t & B,
                               const Tensor_t & A);
   ///@}
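
   // Illustrative sketch (not part of the original header): applying a ReLU
   // in-place through the generic entry point; EActivationFunction::kRelu is
   // assumed to be one of the enum values declared in TMVA/DNN/Functions.h.
   //
   //   ActivationDescriptor_t descr; // dummy on the CPU architecture
   //   TCpu<Float_t>::ActivationFunctionForward(X, EActivationFunction::kRelu, descr);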

   //____________________________________________________________________________
   //
   // Loss Functions
   //____________________________________________________________________________

   /** @name Loss Functions
    * Loss functions compute a scalar value given the \p output of the network
    * for a given training input and the expected network prediction \p Y that
    * quantifies the quality of the prediction. For each function a routine
    * that computes the gradients (suffixed by Gradients) must also be provided
    * for the starting of the backpropagation algorithm.
    */
   ///@{

   static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
                                    const Matrix_t &weights);
   static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
                                         const Matrix_t &output, const Matrix_t &weights);

   /** Sigmoid transformation is implicitly applied, thus \p output should
    * hold the linear activations of the last layer in the net. */
   static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                const Matrix_t &weights);

   static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                     const Matrix_t &output, const Matrix_t &weights);

   /** Softmax transformation is implicitly applied, thus \p output should
    * hold the linear activations of the last layer in the net. */
   static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                       const Matrix_t &weights);
   static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                            const Matrix_t &output, const Matrix_t &weights);
   ///@}
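
   // Illustrative sketch (not part of the original header): evaluating the
   // loss and seeding backpropagation with its gradient; Y, output and
   // eventWeights are hypothetical matrices of matching dimensions.
   //
   //   Scalar_t loss = TCpu<Float_t>::MeanSquaredError(Y, output, eventWeights);
   //   TCpu<Float_t>::MeanSquaredErrorGradients(dY, Y, output, eventWeights);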

   //____________________________________________________________________________
   //
   // Output Functions
   //____________________________________________________________________________

   /** @name Output Functions
    * Output functions transform the activations \p output of the
    * output layer in the network to a valid prediction \p YHat for
    * the desired usage of the network, e.g. the identity function
    * for regression or the sigmoid transformation for two-class
    * classification.
    */
   ///@{
   static void Sigmoid(Matrix_t &YHat,
                       const Matrix_t & );
   static void Softmax(Matrix_t &YHat,
                       const Matrix_t & );
   ///@}
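
   // Illustrative sketch (not part of the original header): turning the last
   // layer's linear activations into two-class probabilities.
   //
   //   TCpu<Float_t>::Sigmoid(YHat, linearOutput); // entries of YHat in (0, 1)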

   //____________________________________________________________________________
   //
   // Regularization
   //____________________________________________________________________________

   /** @name Regularization
    * For each regularization type two functions are required, one named
    * <tt><Type>Regularization</tt> that evaluates the corresponding
    * regularization functional for a given weight matrix and the
    * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
    * component in the gradients to the provided matrix.
    */
   ///@{

   static Scalar_t L1Regularization(const Matrix_t & W);
   static void AddL1RegularizationGradients(Matrix_t & A,
                                            const Matrix_t & W,
                                            Scalar_t weightDecay);

   static Scalar_t L2Regularization(const Matrix_t & W);
   static void AddL2RegularizationGradients(Matrix_t & A,
                                            const Matrix_t & W,
                                            Scalar_t weightDecay);
   ///@}
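
   // Illustrative sketch (not part of the original header): adding an L2
   // penalty to the loss and its contribution to the weight gradients;
   // weightDecay is the regularization strength.
   //
   //   loss += weightDecay * TCpu<Float_t>::L2Regularization(W);
   //   TCpu<Float_t>::AddL2RegularizationGradients(dW, W, weightDecay);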

   //____________________________________________________________________________
   //
   // Initialization
   //____________________________________________________________________________

   /** @name Initialization
    * For each initialization method, one function in the low-level interface
    * is provided. The naming scheme is <p>Initialize<Type></p> for a given
    * initialization method Type.
    */
   ///@{

   static void InitializeGauss(Matrix_t & A);
   static void InitializeUniform(Matrix_t & A);
   static void InitializeIdentity(Matrix_t & A);
   static void InitializeZero(Matrix_t & A);
   static void InitializeGlorotNormal(Matrix_t & A);
   static void InitializeGlorotUniform(Matrix_t & A);

   // Return the static instance of the random generator used for initialization.
   // If the generator does not exist, it is created the first time with a random seed (e.g. seed = 0).
   static TRandom & GetRandomGenerator();
   // Set the random seed for the static generator.
   // If the static generator does not exist, it is created.
   static void SetRandomSeed(size_t seed);
   ///@}
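
   // Illustrative sketch (not part of the original header): reproducible
   // Glorot initialization of a weight matrix.
   //
   //   TCpu<Float_t>::SetRandomSeed(42);
   //   TCpu<Float_t>::InitializeGlorotUniform(W);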

   //____________________________________________________________________________
   //
   // Dropout
   //____________________________________________________________________________

   /** @name Dropout
    */
   ///@{

   /** Apply dropout with activation probability \p p to the given
    * tensor \p A and scale the result by the reciprocal of \p p. */
   static void DropoutForward(Tensor_t & A,
                              TDescriptors * descriptors,
                              TWorkspace * workspace,
                              Scalar_t p);

   static void DropoutForward(Matrix_t & A, Scalar_t p) {
      Tensor_t tA(A);
      DropoutForward( tA, static_cast<TDescriptors *>(nullptr), static_cast<TWorkspace *>(nullptr), p );
   }

   // Only needed for cuDNN
   static void DropoutBackward(Tensor_t & /*A*/,
                               TDescriptors * /*descriptors*/,
                               TWorkspace * /*workspace*/) {}
   ///@}
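
   // Illustrative sketch (not part of the original header): inverted dropout
   // on a matrix, keeping each unit with probability p = 0.5 and scaling the
   // survivors by 1/p so the expected activation is unchanged.
   //
   //   TCpu<Float_t>::DropoutForward(A, 0.5);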

   //____________________________________________________________________________
   //
   // Batch Normalization
   //____________________________________________________________________________

   /** @name Batch Normalization Layer Propagation
    */
   ///@{

   /** The inputs from each batch are normalized during training to have zero mean and unit variance,
    * and are then scaled by two parameters, different for each input variable:
    * - a scale factor gamma
    * - an offset beta */
   static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
                                             Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
                                             Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
                                             Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
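
   // For reference, the per-variable training-time transformation sketched in
   // the comment above (illustrative; epsilon guards against division by zero):
   //
   //   x_hat = (x - mean) / sqrt(variance + epsilon)
   //   y     = gamma * x_hat + beta
   //
   // with the running mean/variance updated as an exponential moving average
   // controlled by momentum.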


   /** During inference the inputs are not normalized using the batch mean but
    * the previously computed running mean and variance. */
   static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
                                              Tensor_t &y, const Matrix_t &runningMeans,
                                              const Matrix_t &runningVars, Scalar_t epsilon,
                                              const TensorDescriptor_t &);

   /** Backward propagation through the batch-normalization layer. */
   static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
                                      Matrix_t &gamma, // Matrix_t &beta, (not needed)
                                      Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
                                      const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);

   // helper function for the BNorm layer
   static Tensor_t BatchNormLayerReshapeTensor(int axis, const Tensor_t &x);

   ///@}

   //____________________________________________________________________________
   //
   // Convolutional Layer Propagation
   //____________________________________________________________________________

   /** @name Forward Propagation in Convolutional Layer
    */
   ///@{

   /** Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters.
    */
   static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
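
   // For reference, the usual convolution output-size formula this computes
   // (illustrative worked example, not from the original header):
   //
   //   outDim = (imgDim - fltDim + 2 * padding) / stride + 1
   //
   // e.g. imgDim = 28, fltDim = 5, padding = 0, stride = 1 gives
   // (28 - 5 + 0) / 1 + 1 = 24.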

   /** Transform the matrix \p B in local view format, suitable for
    * convolution, and store it in matrix \p A. */
   static void Im2col(Matrix_t &A, const Matrix_t &B, size_t imgHeight, size_t imgWidth, size_t fltHeight,
                      size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
                      size_t zeroPaddingWidth);

   static void Im2colIndices(std::vector<int> &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight,
                             size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                             size_t zeroPaddingHeight, size_t zeroPaddingWidth);
   static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector<int> &V);

   /** Rotates the matrix \p B, which represents the weights,
    * and stores it in the matrix \p A. */
   static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight,
                             size_t filterWidth, size_t numFilters);

   /** Add the biases in the Convolutional Layer. */
   static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
   ///@}

   /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
   static void PrepareInternals(Tensor_t &) {}

   /** Forward propagation in the Convolutional layer */
   static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, const Tensor_t &input,
                                const Matrix_t &weights, const Matrix_t &biases, const DNN::CNN::TConvParams &params,
                                EActivationFunction activFunc, Tensor_t & /* inputPrime */,
                                const ConvDescriptors_t & /*descriptors*/, // Empty struct for the CUDA architecture
                                ConvWorkspace_t & /*workspace*/); // Empty struct for the CUDA architecture
   // void * cudnnWorkspace = nullptr); // Remains nullptr for the CUDA architecture

   /** @name Backward Propagation in Convolutional Layer
    */
   ///@{

   /** Perform the complete backward propagation step in a Convolutional Layer.
    * If the provided \p activationGradientsBackward matrix is not empty, compute the
    * gradients of the objective function with respect to the activations
    * of the previous layer (backward direction).
    * Also compute the weight and the bias gradients. Modifies the values
    * in \p df and thus produces a valid result only the first time it is
    * applied after the corresponding forward propagation has been
    * performed. */
   static void
   ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients,
                     Tensor_t &df, Tensor_t &activationGradients, const Matrix_t &weights,
                     const Tensor_t &activationBackward, const Tensor_t &outputTensor, EActivationFunction activFunc,
                     const ConvDescriptors_t & /*descriptors*/, ConvWorkspace_t & /*workspace*/, size_t batchSize,
                     size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width,
                     size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews);

   /** Utility function for calculating the activation gradients of the layer
    * before the convolutional layer. */
   static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, const Tensor_t &df,
                                                const Matrix_t &weights, size_t batchSize, size_t inputHeight,
                                                size_t inputWidth, size_t depth, size_t height, size_t width,
                                                size_t filterDepth, size_t filterHeight, size_t filterWidth);

   /** Utility function for calculating the weight gradients of the convolutional
    * layer. */
   static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df,
                                            const Tensor_t &activations_backward, size_t batchSize, size_t inputHeight,
                                            size_t inputWidth, size_t depth, size_t height, size_t width,
                                            size_t filterDepth, size_t filterHeight, size_t filterWidth,
                                            size_t nLocalViews);

   /** Utility function for calculating the bias gradients of the convolutional
    * layer. */
   static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df, size_t batchSize, size_t depth,
                                          size_t nLocalViews);
   ///@}

   //____________________________________________________________________________
   //
   // Max Pooling Layer Propagation
   //____________________________________________________________________________
   /** @name Forward Propagation in Max Pooling Layer
    */
   ///@{

   /** Downsample the matrix \p C to the matrix \p A, using max
    * operation, such that the winning indices are stored in matrix
    * \p B. */
   static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C, const PoolingDescriptors_t & /*descriptors*/,
                          PoolingWorkspace_t & /*workspace*/, size_t imgHeight, size_t imgWidth, size_t fltHeight,
                          size_t fltWidth, size_t strideRows, size_t strideCols);

   ///@}
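
   // Illustrative sketch (not part of the original header): a 2x2 max pool
   // with stride 2 on 28x28 feature maps; descr and ws are the dummy
   // descriptor/workspace objects of this architecture.
   //
   //   TCpu<Float_t>::Downsample(A, B, C, descr, ws,
   //                             28, 28,  // image height, width
   //                             2, 2,    // filter height, width
   //                             2, 2);   // stride rows, cols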

   /** @name Backward Propagation in Max Pooling Layer
    */
   ///@{
   /** Perform the complete backward propagation step in a Pooling Layer. Based on the
    * winning indices stored in the index matrix, it just forwards the activation
    * gradients to the previous layer. */
   static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
                                    const Tensor_t &indexMatrix, const Tensor_t & /*inputActivation*/,
                                    const Tensor_t & /*outputTensor*/, const PoolingDescriptors_t & /*descriptors*/,
                                    PoolingWorkspace_t & /*workspace*/, size_t imgHeight, size_t imgWidth,
                                    size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                                    size_t nLocalViews);

   ///@}

   //____________________________________________________________________________
   //
   // Reshape Layer Propagation
   //____________________________________________________________________________
   /** @name Forward and Backward Propagation in Reshape Layer
    */
   ///@{

   /** Transform the matrix \p B to a matrix with different dimensions \p A */
   static void Reshape(Matrix_t &A, const Matrix_t &B);

   /** Flattens the tensor \p B, such that each matrix is stretched into
    * one row, resulting in the matrix \p A. */
   static void Flatten(Tensor_t &A, const Tensor_t &B); // size_t size, size_t nRows, size_t nCols);

   /** Transforms each row of \p B to a matrix and stores it in the
    * tensor \p A. */
   static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows, size_t nCols);
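
   // Illustrative sketch (not part of the original header): Flatten and
   // Deflatten act as inverse operations, so a round trip restores the tensor.
   //
   //   TCpu<Float_t>::Flatten(flat, X);    // each matrix of X -> one row of flat
   //   TCpu<Float_t>::Deflatten(X2, flat); // X2 has the shape and contents of X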

   /** Rearrange data according to time: fill the B x T x D output tensor
    * from a T x B x D input tensor. */
   static void Rearrange(Tensor_t &out, const Tensor_t &in);

   /** Backward pass for Recurrent Networks */
   static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
                                           Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients,
                                           Matrix_t &bias_gradients,
                                           Matrix_t &df, // DxH
                                           const Matrix_t &state, // BxH
                                           const Matrix_t &weights_input, // HxD
                                           const Matrix_t &weights_state, // HxH
                                           const Matrix_t &input, // BxD
                                           Matrix_t &input_gradient);

   ///@}

   //____________________________________________________________________________
   //
   // Additional Arithmetic Functions
   //____________________________________________________________________________

   /** @name Additional Arithmetic Functions
    *
    * Additional arithmetic on CPU matrices used to implement the low-level
    * interface.
    */
   ///@{

   /** Standard multiplication of two matrices \p A and \p B with the result being
    * written into \p C.
    */
   static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B);
   /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
    * result being written into \p C.
    */
   static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha = 1.0,
                                 Scalar_t beta = 0.);
   /** In-place Hadamard (element-wise) product of matrices \p A and \p B
    * with the result being written into \p A.
    */
   static void Hadamard(Tensor_t &A, const Tensor_t &B);
   static void Hadamard(Matrix_t &A, const Matrix_t &B);
   // {
   //    Tensor_t tA(A);
   //    Hadamard( tA, Tensor_t(B));
   // }

   /** Sum columns of (m x n) matrix \p A and write the results into the first
    * m elements of \p B.
    */
   static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.);

   /** Compute the sum of all elements in \p A */
   static Scalar_t Sum(const Matrix_t &A);

   /** Check two matrices for equality, taking floating point arithmetic errors into account. */
   static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);

   /** Add the constant \p beta to all the elements of matrix \p A and write the
    * result into \p A.
    */
   static void ConstAdd(Matrix_t &A, Scalar_t beta);

   /** Multiply all the elements of matrix \p A by the constant \p beta and write the
    * result into \p A.
    */
   static void ConstMult(Matrix_t &A, Scalar_t beta);

   /** Take the reciprocal of each element of the matrix \p A and write the result into
    * \p A.
    */
   static void ReciprocalElementWise(Matrix_t &A);

   /** Square each element of the matrix \p A and write the result into
    * \p A.
    */
   static void SquareElementWise(Matrix_t &A);

   /** Take the square root of each element of the matrix \p A and write the result into
    * \p A.
    */
   static void SqrtElementWise(Matrix_t &A);

   // optimizer functions
   static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps);
   static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);
   static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);

   // printing of tensor
   static void PrintTensor(const Tensor_t &A, const std::string name = "Cpu-tensor", bool truncate = false);
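
   // For reference, the Adam steps these helpers correspond to (illustrative,
   // following the standard Adam formulas; names match the parameters above):
   //
   //   M <- beta1 * M + (1 - beta1) * dW         (AdamUpdateFirstMom)
   //   V <- beta2 * V + (1 - beta2) * dW * dW    (AdamUpdateSecondMom)
   //   A <- A - alpha * M / (sqrt(V) + eps)      (AdamUpdate)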

};

//____________________________________________________________________________
template <typename AReal>
template <typename AMatrix_t>
void TCpu<AReal>::CopyDiffArch(TCpuMatrix<AReal> &B,
                               const AMatrix_t &A)
{
   // copy from another architecture using the reference one
   // this is not very efficient since it creates temporary objects
   TMatrixT<AReal> tmp = A; // this works also if A is a tensor
   Copy(B, TCpuMatrix<AReal>(tmp));
}
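
// Illustrative sketch (not part of the original header): copying weights held
// as a ROOT TMatrixT into the CPU architecture representation, via the
// TMatrixT conversion used above.
//
//   TMatrixT<Float_t> src(10, 5);
//   TCpuMatrix<Float_t> dst(10, 5);
//   TCpu<Float_t>::CopyDiffArch(dst, src);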

//____________________________________________________________________________
template <typename AReal>
template <typename ATensor_t>
void TCpu<AReal>::CopyDiffArch(TCpuTensor<AReal> &B,
                               const ATensor_t &A)
{

   R__ASSERT(A.GetSize() == B.GetSize());
   // suppose A is of shape (B,D,H,W) and we want to convert it to (B,HW,D) or (D,HW,B) in ColumnMajor format
   for (size_t i = 0; i < A.GetFirstSize(); ++i) {
      TMatrixT<AReal> tmpIn = A.At(i); // this converts tensor (B,D,H,W) in (D,H,W)i -> (D,HW)i

      TCpuMatrix<AReal> tmpOut = B.At(i).GetMatrix(); // matrix (D,HW)
      Copy(tmpOut, TCpuMatrix<AReal>(tmpIn));
   }

   // ATensor_t tmpIn = A.Reshape({A.GetNrows(), A.GetNcols()});
   // auto tmpOut = B.Reshape({A.GetNrows(), A.GetNcols()});
   // Matrix_t mOut = tmpOut.GetMatrix();
   // CopyDiffArch(mOut, tmpIn.GetMatrix());
}

// Implementation using a vector of matrices for the weights
template <typename AReal>
template <typename AMatrix_t>
void TCpu<AReal>::CopyDiffArch(std::vector<TCpuMatrix<AReal>> &A, const std::vector<AMatrix_t> &B)
{
   for (size_t i = 0; i < A.size(); ++i) {
      CopyDiffArch(A[i], B[i]);
   }
}

template <typename AReal>
void TCpu<AReal>::PrintTensor(const typename TCpu<AReal>::Tensor_t &A, const std::string name, bool truncate)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";

   // print elements
   // need to find a way to nicely print all elements
   std::cout << " tensor count " << A.GetBufferUseCount() << std::endl;
   if (A.GetShape().size() == 2) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         size_t n = A.GetShape()[1];
         if (truncate) n = std::min(n, size_t(10));
         for (size_t j = 0; j < n; ++j) {
            std::cout << A(i, j) << " ";
         }
         if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
         std::cout << " } " << std::endl;
      }
   } else if (A.GetShape().size() == 3) {
      for (size_t i = 0; i < A.GetFirstSize(); ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetHSize(); ++j) {
            std::cout << "{ ";
            size_t n = A.GetWSize();
            if (truncate) n = std::min(n, size_t(10));
            for (size_t k = 0; k < n; ++k) {
               std::cout << A(i, j, k) << " ";
            }
            if (truncate && n < A.GetWSize()) std::cout << " ...... ";
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   } else {
      for (size_t l = 0; l < A.GetSize(); ++l) {
         std::cout << A.GetData()[l] << " ";
      }
      std::cout << "\n";
   }
}


} // namespace DNN
} // namespace TMVA

#endif