Cuda.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 05/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

///////////////////////////////////////////////////////////////////
// Definition of the TCuda architecture class, which provides an //
// implementation of the low-level functionality for neural      //
// networks for the CUDA computing architectures.                //
///////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_ARCHITECTURES_CUDA
#define TMVA_DNN_ARCHITECTURES_CUDA

#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/CNN/ConvLayer.h"

#include "cuda.h"
#include "Cuda/CudaBuffers.h"
#include "Cuda/CudaMatrix.h"
#include "Cuda/CudaTensor.h"
#include "TMVA/DNN/DataLoader.h"
#include <utility>
#include <vector>

class TRandom;

namespace TMVA
{
namespace DNN
{
struct CudaActivationDescriptor {};
struct CudaFilterDescriptor {};
struct CudaConvolutionDescriptor {};
struct CudaDropoutDescriptor {};
struct CudaPoolingDescriptor {};
struct CudaConvolutionFwdAlgo {};
struct CudaConvolutionBwdDataAlgo {};
struct CudaConvolutionBwdFilterAlgo {};
struct CudaDataType {};
struct DummyType {};

struct CudaEmptyDescriptor {};

/** The TCuda architecture class.
 *
 * Low-level interface class for CUDA computing architectures. Contains as
 * public types the declarations of the scalar, matrix and buffer types
 * for this architecture, as well as the remaining functions of the low-level
 * interface in the form of static members.
 */
template<typename AReal = Float_t>
class TCuda
{
private:
   static TRandom * fgRandomGen;
public:

   using AFloat = AReal;
   using Scalar_t = AFloat;

   using Matrix_t = TCudaMatrix<AFloat>;
   using Tensor_t = TCudaTensor<AFloat>;
   using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
   using HostBuffer_t = TCudaHostBuffer<AFloat>;

   using ActivationDescriptor_t = CudaActivationDescriptor;
   using ConvolutionDescriptor_t = CudaConvolutionDescriptor;
   using FilterDescriptor_t = CudaFilterDescriptor;
   using DropoutDescriptor_t = CudaDropoutDescriptor;
   //using OpTensorDescriptor_t = CudaOpTensorDescriptor;
   using PoolingDescriptor_t = CudaPoolingDescriptor;
   using TensorDescriptor_t = DummyType;
   //using ReductionDescriptor_t = CudaReduceTensorDescriptor;
   using AlgorithmForward_t = CudaConvolutionFwdAlgo;
   using AlgorithmBackward_t = CudaConvolutionBwdDataAlgo;
   using AlgorithmHelper_t = CudaConvolutionBwdFilterAlgo;
   using AlgorithmDataType_t = CudaDataType;
   using ReduceTensorDescriptor_t = DummyType;

   using EmptyDescriptor_t = CudaEmptyDescriptor; // Used if a descriptor is not needed in a class

   using BNormLayer_t = TBatchNormLayer<TCuda<AReal>>;
   using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;

   using ConvLayer_t = CNN::TConvLayer<TCuda<AReal>>;
   using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
   using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
   using PoolingLayer_t = CNN::TMaxPoolLayer<TCuda<AReal>>;
   using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
   using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;

   static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::ColumnMajor; }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t( {c, h * w, n}, GetTensorLayout());
   }
   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t( buffer, {c, h * w, n}, GetTensorLayout(), 0, 0);
   }

   // create a weight tensor/matrix from another tensor using its shape
   // static Matrix_t CreateWeightTensor( Matrix_t & A) {
   //    return Matrix_t( A.GetNrows(), A.GetNcols());
   // }
   // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
   // this function is used by the optimizers to store intermediate weight representations
   static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
      if (!newWeights.empty()) newWeights.clear();
      size_t n = weights.size();
      for (size_t i = 0; i < n; ++i)
         newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
   }
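
   // Illustrative sketch (not part of the interface): creating a column-major
   // activation tensor and optimizer weight copies. `batchSize`, `channels`,
   // `height`, `width` and `layerWeights` are hypothetical.
   //
   //    auto activations = TCuda<>::CreateTensor(batchSize, channels, height, width);
   //    std::vector<TCuda<>::Matrix_t> firstMoments;
   //    TCuda<>::CreateWeightTensors(firstMoments, layerWeights); // same shapes, e.g. for Adam moments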

   //____________________________________________________________________________
   //
   // Architecture Initialization
   //____________________________________________________________________________

   /** Initialize CNN data/operator descriptors. Not used at the moment.*/

   static void InitializeBNormDescriptors(TDescriptors * & /*descriptors*/,
                                          BNormLayer_t * /*L = nullptr*/) {
      Error("InitializeBNormDescriptors", "Batch normalization on GPU is supported only with Cudnn");
   }

   static void InitializeConvDescriptors(TDescriptors *& /*descriptors*/, ConvLayer_t * /*L = nullptr*/) {}

   static void InitializePoolDescriptors(TDescriptors *& /*descriptors*/, PoolingLayer_t * /*L = nullptr*/) {}

   static void InitializeActivationDescriptor(ActivationDescriptor_t & /*descriptors*/, EActivationFunction /*activFunc*/, double /*coef*/ = 0.0) {}

   /** Release CNN data/operator descriptors. Not used at the moment.*/
   static void ReleaseConvDescriptors(TDescriptors * & /*descriptors*/) {}
   static void ReleasePoolDescriptors(TDescriptors * & /*descriptors*/) {}
   static void ReleaseBNormDescriptors(TDescriptors *& /*descriptors*/) {}

   static void InitializeConvWorkspace(TWorkspace * & /*workspace*/,
                                       TDescriptors * & /*descriptors*/,
                                       const DNN::CNN::TConvParams & /*params*/,
                                       ConvLayer_t * /*L = nullptr*/) {}
   static void InitializePoolDropoutWorkspace(TWorkspace * & /*workspace*/,
                                              TDescriptors * & /*descriptors*/,
                                              const DNN::CNN::TConvParams & /*params*/,
                                              PoolingLayer_t * /*L = nullptr*/) {}

   static void ReleaseDescriptor(ActivationDescriptor_t & /*activationDescr*/) {}

   static void FreeConvWorkspace(TWorkspace * & /*workspace*/, ConvLayer_t * /*L = nullptr*/) {} ///< Only used for certain cudnn on-device memory
   static void FreePoolDropoutWorkspace(TWorkspace * & /*workspace*/, PoolingLayer_t * /*L = nullptr*/) {}

   //____________________________________________________________________________
   //
   // Propagation
   //____________________________________________________________________________

   /** @name Forward Propagation
    * Low-level functions required for the forward propagation of activations
    * through the network.
    */
   ///@{
   /** Matrix-multiply \p input with the transpose of \p weights and
    *  write the results into \p output. */
   static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights);

   static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights) {
      Matrix_t output_matrix = output.GetMatrix();
      MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
      // Tensor_t::MatrixToTensor(output_matrix, output); // this may not be needed
   }

   /** Add the vectors biases row-wise to the matrix output */
   static void AddRowWise(Matrix_t &output, const Matrix_t &biases);

   static void AddRowWise(Tensor_t &output, const Matrix_t &biases) {
      Matrix_t output_matrix = output.GetMatrix();
      AddRowWise(output_matrix, biases);
      // Tensor_t::MatrixToTensor(output_matrix, output); // this may not be needed
   }
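
   // Illustrative sketch of a dense-layer forward pass composed from the two
   // primitives above (`input`, `weights`, `biases` and `output` are
   // hypothetical objects with compatible shapes):
   //
   //    TCuda<>::MultiplyTranspose(output, input, weights); // output = input * weights^T
   //    TCuda<>::AddRowWise(output, biases);                // broadcast biases over the rows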

   /** @name Backward Propagation (Dense Layers)
    * Low-level functions required for the backward propagation of activations
    * through the network.
    */
   ///@{
   /** Perform the complete backward propagation step. If the provided
    *  \p activationGradientsBackward matrix is not empty, compute the
    *  gradients of the objective function with respect to the activations
    *  of the previous layer (backward direction).
    *  Also compute the weight and the bias gradients. Modifies the values
    *  in \p df and thus produces a valid result only the first time it is
    *  applied after the corresponding forward propagation has been performed. */
   static void Backward(Tensor_t & activationGradientsBackward,
                        Matrix_t & weightGradients,
                        Matrix_t & biasGradients,
                        const Tensor_t & df,
                        const Tensor_t & activationGradients,
                        const Matrix_t & weights,
                        const Tensor_t & activationBackward);

   /** Adds the elements of matrix \p B, scaled by \p beta, to the elements of
    *  matrix \p A. This is required for the weight update in the gradient
    *  descent step. */
   static void ScaleAdd(Matrix_t & A,
                        const Matrix_t & B,
                        Scalar_t beta = 1.0);
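
   // Illustrative sketch (an assumption, not prescribed by this header): a plain
   // gradient-descent weight update expressed with ScaleAdd, where `W` is a
   // weight matrix, `dW` its gradient and `lr` a hypothetical learning rate:
   //
   //    TCuda<>::ScaleAdd(W, dW, -lr); // W <- W - lr * dW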

   static void Copy(Matrix_t & B,
                    const Matrix_t & A);

   // copy from another type of matrix
   template<typename AMatrix_t>
   static void CopyDiffArch(Matrix_t & B, const AMatrix_t & A);


   /** Above functions extended to tensors */
   static void ScaleAdd(Tensor_t & A,
                        const Tensor_t & B,
                        Scalar_t beta = 1.0);

   static void Copy(Tensor_t & A,
                    const Tensor_t & B);

   // copy from another type of tensor
   template<typename ATensor_t>
   static void CopyDiffArch(Tensor_t & A,
                            const ATensor_t & B);

   // copy from a vector of matrices of a different type
   template<typename AMatrix_t>
   static void CopyDiffArch(std::vector<Matrix_t> & A,
                            const std::vector<AMatrix_t> & B);

   ///@}

   //____________________________________________________________________________
   //
   // Activation Functions
   //____________________________________________________________________________

   /** @name Activation Functions
    * For each activation function, the low-level interface contains two routines:
    * one that applies the activation function to a matrix and one that evaluates
    * the derivatives of the activation function at the elements of a given matrix
    * and writes the results into the result matrix.
    */
   ///@{
   /* impl using Matrix */
   /*inline void evaluate(Matrix_t &A, EActivationFunction f)
   {
      Tensor_t tA(A);
      evaluate<TCuda<AReal>>(tA, f);
   }*/
   static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr,
                                         const double coef = 0.0, const AFloat alpha = 1,
                                         const AFloat beta = 0);

   /** Computes the gradient of the activation function */
   static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
                                          const Tensor_t & dY, const Tensor_t & X,
                                          EActivationFunction activFunct,
                                          const ActivationDescriptor_t activationDescr,
                                          const AFloat alpha = 1,
                                          const AFloat beta = 0);

   static void IdentityDerivative(Tensor_t & B,
                                  const Tensor_t & A);

   static void Relu(Tensor_t & B);
   static void ReluDerivative(Tensor_t & B,
                              const Tensor_t & A);

   static void Sigmoid(Tensor_t & B);
   static void SigmoidDerivative(Tensor_t & B,
                                 const Tensor_t & A);

   static void Tanh(Tensor_t & B);
   static void TanhDerivative(Tensor_t & B,
                              const Tensor_t & A);

   static void SymmetricRelu(Tensor_t & B);
   static void SymmetricReluDerivative(Tensor_t & B,
                                       const Tensor_t & A);

   static void SoftSign(Tensor_t & B);
   static void SoftSignDerivative(Tensor_t & B,
                                  const Tensor_t & A);

   static void Gauss(Tensor_t & B);
   static void GaussDerivative(Tensor_t & B,
                               const Tensor_t & A);
   ///@}
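
   // Illustrative sketch of the forward/backward activation calls, assuming the
   // kRelu enumerator of EActivationFunction from TMVA/DNN/Functions.h; the
   // tensors and the descriptor are hypothetical:
   //
   //    TCuda<>::ActivationFunctionForward(X, EActivationFunction::kRelu, descr);
   //    TCuda<>::ActivationFunctionBackward(dX, Y, dY, X, EActivationFunction::kRelu, descr);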

   //____________________________________________________________________________
   //
   // Loss Functions
   //____________________________________________________________________________

   /** @name Loss Functions
    * Loss functions compute a scalar value, given the \p output of the network
    * for a given training input and the expected network prediction \p Y, that
    * quantifies the quality of the prediction. For each function a routine
    * that computes the gradients (suffixed by Gradients) must also be provided
    * to start the backpropagation algorithm.
    */
   ///@{

   static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
                                    const Matrix_t &weights);
   static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
                                         const Matrix_t &output, const Matrix_t &weights);

   /** Sigmoid transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer in the net. */
   static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                const Matrix_t &weights);

   static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                     const Matrix_t &output, const Matrix_t &weights);

   /** Softmax transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer in the net. */
   static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                       const Matrix_t &weights);
   static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                            const Matrix_t &output, const Matrix_t &weights);
   ///@}
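
   // Illustrative sketch of evaluating a loss and seeding backpropagation with
   // its gradient (`Y`, `output`, `eventWeights` and `dY` are hypothetical
   // matrices of compatible shapes):
   //
   //    Scalar_t loss = TCuda<>::MeanSquaredError(Y, output, eventWeights);
   //    TCuda<>::MeanSquaredErrorGradients(dY, Y, output, eventWeights);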

   //____________________________________________________________________________
   //
   // Output Functions
   //____________________________________________________________________________

   /** @name Output Functions
    * Output functions transform the activations \p output of the
    * output layer in the network to a valid prediction \p YHat for
    * the desired usage of the network, e.g. the identity function
    * for regression or the sigmoid transformation for two-class
    * classification.
    */
   ///@{
   static void Sigmoid(Matrix_t &YHat,
                       const Matrix_t &);
   static void Softmax(Matrix_t &YHat,
                       const Matrix_t &);
   ///@}

   //____________________________________________________________________________
   //
   // Regularization
   //____________________________________________________________________________

   /** @name Regularization
    * For each regularization type two functions are required, one named
    * <tt><Type>Regularization</tt> that evaluates the corresponding
    * regularization functional for a given weight matrix and the
    * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
    * component in the gradients to the provided matrix.
    */
   ///@{

   static Scalar_t L1Regularization(const Matrix_t & W);
   static void AddL1RegularizationGradients(Matrix_t & A,
                                            const Matrix_t & W,
                                            Scalar_t weightDecay);

   static Scalar_t L2Regularization(const Matrix_t & W);
   static void AddL2RegularizationGradients(Matrix_t & A,
                                            const Matrix_t & W,
                                            Scalar_t weightDecay);
   ///@}
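
   // Illustrative sketch (`W`, `dW` and `weightDecay` are hypothetical): the
   // regularization term enters the loss, and its gradient contribution is
   // added to the weight gradients:
   //
   //    Scalar_t regLoss = weightDecay * TCuda<>::L2Regularization(W);
   //    TCuda<>::AddL2RegularizationGradients(dW, W, weightDecay);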

   //____________________________________________________________________________
   //
   // Initialization
   //____________________________________________________________________________

   /** @name Initialization
    * For each initialization method, one function in the low-level interface
    * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
    * initialization method Type.
    */
   ///@{

   static void InitializeGauss(Matrix_t & A);
   static void InitializeUniform(Matrix_t & A);
   static void InitializeIdentity(Matrix_t & A);
   static void InitializeZero(Matrix_t & A);
   static void InitializeGlorotNormal(Matrix_t & A);
   static void InitializeGlorotUniform(Matrix_t & A);

   // return the static instance of the random generator used for initialization;
   // if the generator does not exist, it is created the first time with a random seed (e.g. seed = 0)
   static TRandom & GetRandomGenerator();
   // set the random seed for the static generator;
   // if the static generator does not exist, it is created
   static void SetRandomSeed(size_t seed);
   ///@}
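
   // Illustrative sketch: seed the shared generator before initializing weights,
   // so runs are reproducible (`W` is a hypothetical weight matrix):
   //
   //    TCuda<>::SetRandomSeed(42);
   //    TCuda<>::InitializeGlorotUniform(W);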

   //____________________________________________________________________________
   //
   // Dropout
   //____________________________________________________________________________

   /** @name Dropout
    */
   ///@{

   /** Apply dropout with activation probability \p p to the given
    *  tensor \p A and scale the result by the reciprocal of \p p. */
   static void DropoutForward(Tensor_t & A,
                              TDescriptors * descriptors,
                              TWorkspace * workspace,
                              Scalar_t p);

   static void DropoutForward(Matrix_t & A, Scalar_t p) {
      Tensor_t tA(A);
      DropoutForward( tA, static_cast<TDescriptors *>(nullptr), static_cast<TWorkspace *>(nullptr), p );
   }

   static void DropoutBackward(Tensor_t & /* A */,
                               TDescriptors * /* descriptors */,
                               TWorkspace * /* workspace */) {}
   ///@}
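
   // Illustrative sketch: inverted dropout on a hypothetical activation matrix
   // `A` with keep probability p = 0.5 (kept entries are scaled by 1/p):
   //
   //    TCuda<>::DropoutForward(A, 0.5);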

   //____________________________________________________________________________
   //
   // Batch Normalization
   //____________________________________________________________________________

   /** @name Batch Normalization Layer Propagation
    */
   ///@{

   /** The inputs from each batch are normalized during training to have zero mean
    *  and unit variance, and they are then scaled by two parameters, different for
    *  each input variable:
    *  - a scale factor \f$\gamma\f$
    *  - an offset \f$\beta\f$ */

   static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
                                             Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
                                             Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
                                             Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);

   /** During inference the inputs are not normalized using the batch mean but
    *  the previously computed running mean and variance. */

   static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta, Tensor_t &y,
                                              const Matrix_t &runningMeans, const Matrix_t &runningVars,
                                              Scalar_t epsilon, const TensorDescriptor_t &);

   static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
                                      Matrix_t &gamma, // Matrix_t &beta, (not needed)
                                      Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
                                      const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
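
   // For reference, the training-time transformation described above is the
   // standard batch-normalization formula, applied per input variable:
   //
   //    y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
   //
   // with the running statistics typically updated as
   //
   //    runningMean <- momentum * runningMean + (1 - momentum) * batchMean
   //
   // (the exact momentum convention used by this implementation is an assumption).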

   //____________________________________________________________________________
   //
   // Convolutional Layer Propagation
   //____________________________________________________________________________

   /** @name Forward Propagation in Convolutional Layer
    */
   ///@{

   /** Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters. */
   static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
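
   // calculateDimension follows the standard convolution output-size arithmetic;
   // e.g. for a 28-pixel image dimension, a 5-pixel filter, padding 0 and stride 1:
   //
   //    outDim = (imgDim - fltDim + 2 * padding) / stride + 1
   //           = (28 - 5 + 0) / 1 + 1 = 24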

   /** Transform the matrix \p B into local-view format, suitable for
    *  convolution, and store it in matrix \p A. */
   static void Im2col(Matrix_t &A,
                      const Matrix_t &B,
                      size_t imgHeight,
                      size_t imgWidth,
                      size_t fltHeight,
                      size_t fltWidth,
                      size_t strideRows,
                      size_t strideCols,
                      size_t zeroPaddingHeight,
                      size_t zeroPaddingWidth);

   static void Im2colIndices(std::vector<int> &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight, size_t imgWidth, size_t fltHeight,
                             size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
                             size_t zeroPaddingWidth);
   static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector<int> & V);

   /** Rotates the matrix \p B, which represents the weights,
    *  and stores the result in the matrix \p A. */
   static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight,
                             size_t filterWidth, size_t numFilters);

   /** Add the biases in the Convolutional Layer. */
   static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
   ///@}

   /** Dummy placeholder - no special preparation is required for this architecture. */
   static void PrepareInternals(Tensor_t &) {}

   /** Forward propagation in the Convolutional layer */
   static void ConvLayerForward(Tensor_t & output,
                                Tensor_t & inputActivationFunc,
                                const Tensor_t & input,
                                const Matrix_t & weights, const Matrix_t & biases,
                                const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
                                Tensor_t & /* inputPrime */,
                                const ConvDescriptors_t & /*descriptors*/, // Empty struct for cuda architecture
                                ConvWorkspace_t & /*workspace*/); // Empty struct for cuda architecture
                                //void * cudnnWorkspace = nullptr); // Remains nullptr for cuda architecture

   /** @name Backward Propagation in Convolutional Layer
    */
   ///@{

   /** Perform the complete backward propagation step in a Convolutional Layer.
    *  If the provided \p activationGradientsBackward matrix is not empty, compute the
    *  gradients of the objective function with respect to the activations
    *  of the previous layer (backward direction).
    *  Also compute the weight and the bias gradients. Modifies the values
    *  in \p df and thus produces a valid result only the first time it is
    *  applied after the corresponding forward propagation has been performed. */
   static void ConvLayerBackward(Tensor_t &activationGradientsBackward,
                                 Matrix_t &weightGradients, Matrix_t &biasGradients,
                                 Tensor_t &df,
                                 Tensor_t &activationGradients,
                                 const Matrix_t &weights,
                                 const Tensor_t &activationBackward,
                                 const Tensor_t &outputTensor,
                                 EActivationFunction activFunc,
                                 const ConvDescriptors_t & /*descriptors*/,
                                 ConvWorkspace_t & /*workspace*/,
                                 size_t batchSize, size_t inputHeight,
                                 size_t inputWidth, size_t depth,
                                 size_t height, size_t width,
                                 size_t filterDepth, size_t filterHeight,
                                 size_t filterWidth, size_t nLocalViews );

   /** Utility function for calculating the activation gradients of the layer
    *  before the convolutional layer. */
   static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward,
                                                const Tensor_t &df,
                                                const Matrix_t &weights, size_t batchSize,
                                                size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
                                                size_t width, size_t filterDepth, size_t filterHeight,
                                                size_t filterWidth);

   /** Utility function for calculating the weight gradients of the convolutional
    *  layer. */
   static void CalculateConvWeightGradients(Matrix_t &weightGradients,
                                            const Tensor_t &df,
                                            const Tensor_t &activations_backward,
                                            size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
                                            size_t height, size_t width, size_t filterDepth, size_t filterHeight,
                                            size_t filterWidth, size_t nLocalViews);

   /** Utility function for calculating the bias gradients of the convolutional
    *  layer */
   static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df,
                                          size_t batchSize, size_t depth, size_t nLocalViews);
   ///@}

   //____________________________________________________________________________
   //
   // Max Pooling Layer Propagation
   //____________________________________________________________________________
   /** @name Forward Propagation in Max Pooling Layer
    */
   ///@{

   /** Downsample the matrix \p C to the matrix \p A, using the max
    *  operation, such that the winning indices are stored in matrix
    *  \p B. */
   static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C,
                          const PoolingDescriptors_t & /*descriptors*/,
                          PoolingWorkspace_t & /*workspace*/,
                          size_t imgHeight, size_t imgWidth, size_t fltHeight,
                          size_t fltWidth, size_t strideRows, size_t strideCols);

   ///@}

   /** @name Backward Propagation in Max Pooling Layer
    */
   ///@{
   /** Perform the complete backward propagation step in a Pooling Layer. Based on the
    *  winning indices stored in the index matrix, it just forwards the activation
    *  gradients to the previous layer. */
   static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
                                    const Tensor_t &activationGradients,
                                    const Tensor_t &indexMatrix,
                                    const Tensor_t & /*inputActivation*/,
                                    const Tensor_t & /*outputTensor*/,
                                    const PoolingDescriptors_t & /*descriptors*/,
                                    PoolingWorkspace_t & /*workspace*/,
                                    size_t imgHeight,
                                    size_t imgWidth,
                                    size_t fltHeight,
                                    size_t fltWidth,
                                    size_t strideRows,
                                    size_t strideCols,
                                    size_t nLocalViews);

   ///@}

   //____________________________________________________________________________
   //
   // Reshape Layer Propagation
   //____________________________________________________________________________
   /** @name Forward and Backward Propagation in Reshape Layer
    */
   ///@{

   /** Transform the matrix \p B to a matrix with different dimensions \p A */
   static void Reshape(Matrix_t &A, const Matrix_t &B);

   /** Flattens the tensor \p B, such that each matrix is stretched into
    *  one row, resulting in the matrix \p A. */
   static void Flatten(Tensor_t &A, const Tensor_t &B); // size_t size, size_t nRows, size_t nCols);

   /** Transforms each row of \p B to a matrix and stores it in the
    *  tensor \p A. */
   static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows, size_t nCols);

   /** Rearrange data according to time: fill the B x T x D output tensor from a T x B x D input. */
   static void Rearrange(Tensor_t &out, const Tensor_t &in);


   /** Backward pass for Recurrent Networks */
   static Matrix_t & RecurrentLayerBackward(Matrix_t & state_gradients_backward, // BxH
                                            Matrix_t & input_weight_gradients,
                                            Matrix_t & state_weight_gradients,
                                            Matrix_t & bias_gradients,
                                            Matrix_t & df, // DxH
                                            const Matrix_t & state, // BxH
                                            const Matrix_t & weights_input, // HxD
                                            const Matrix_t & weights_state, // HxH
                                            const Matrix_t & input, // BxD
                                            Matrix_t & input_gradient);

   ///@}

   //____________________________________________________________________________
   //
   // Additional Arithmetic Functions
   //____________________________________________________________________________

   /** @name Additional Arithmetic Functions
    *
    * Additional arithmetic on CUDA matrices used to implement the low-level
    * interface.
    */
   ///@{

   /** Standard multiplication of two matrices \p A and \p B with the result being
    *  written into \p C.
    */
   static void Multiply(Matrix_t &C,
                        const Matrix_t &A,
                        const Matrix_t &B);
   /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
    *  result being written into \p C.
    */
   static void TransposeMultiply(Matrix_t &output,
                                 const Matrix_t &input,
                                 const Matrix_t &Weights,
                                 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
   /** In-place Hadamard (element-wise) product of matrices \p A and \p B
    *  with the result being written into \p A.
    */
   static void Hadamard(Tensor_t &A,
                        const Tensor_t &B);
   static void Hadamard(Matrix_t &A,
                        const Matrix_t &B);
   // {
   //    Tensor_t tA(A);
   //    Hadamard( tA, Tensor_t(B));
   // }

   /** Sum columns of the (m x n) matrix \p A and write the results into the
    *  first m elements of \p B.
    */
   static void SumColumns(Matrix_t &B,
                          const Matrix_t &A,
                          Scalar_t alpha = 1.0, Scalar_t beta = 0.);

   /** Compute the sum of all elements in \p A */
   static Scalar_t Sum(const Matrix_t &A);

   /** Check two matrices for equality, taking floating point arithmetic errors into account. */
   static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);

   /** Add the constant \p beta to all the elements of matrix \p A and write the
    *  result into \p A.
    */
   static void ConstAdd(Matrix_t &A, Scalar_t beta);

   /** Multiply all the elements of matrix \p A by the constant \p beta and write the
    *  result into \p A.
    */
   static void ConstMult(Matrix_t &A, Scalar_t beta);

   /** Take the reciprocal of each element of the matrix \p A and write the result into
    *  \p A
    */
   static void ReciprocalElementWise(Matrix_t &A);

   /** Square each element of the matrix \p A and write the result into
    *  \p A
    */
   static void SquareElementWise(Matrix_t &A);

   /** Take the square root of each element of the matrix \p A and write the result into
    *  \p A
    */
   static void SqrtElementWise(Matrix_t &A);

   // optimizer functions
   static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps);
   static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta);
   static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta);
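
   // The three calls above map onto the usual Adam update; the exact pairing of
   // the `beta` arguments with Adam's beta1/beta2 is an assumption:
   //
   //    M <- beta * M + (1 - beta) * dW         // AdamUpdateFirstMom
   //    V <- beta * V + (1 - beta) * dW * dW    // AdamUpdateSecondMom
   //    W <- W - alpha * M / (sqrt(V) + eps)    // AdamUpdate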

   // printing of tensor
   static void PrintTensor( const Tensor_t & A, const std::string name = "Cuda-tensor", bool = false);

   ///////////////////////////////////////////////////////////////////////////////
   /// extra functions defined only for CPU architecture !!!
   //////////////////////////////////////////////////////////////////////////////

   /** Sum rows of the (m x n) matrix \p A and write the results into the first
    *  m elements in \p B.
    */
   static void SumRows(Matrix_t & B, const Matrix_t & A);

};

//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(TCudaMatrix<AFloat> &B,
                                 const AMatrix_t &A)
{
   // copy from another architecture using the reference one
   // this is not very efficient since it creates temporary objects
   TMatrixT<AFloat> tmp = A;
   Copy(B, TCudaMatrix<AFloat>(tmp));
}

//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(std::vector<TCudaMatrix<AFloat>> &B,
                                 const std::vector<AMatrix_t> &A)
{
   for (size_t i = 0; i < B.size(); ++i) {
      CopyDiffArch(B[i], A[i]);
   }
}
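
// Illustrative usage sketch: copying weights from the reference CPU architecture
// (TMVA/DNN/Architectures/Cpu.h) to the GPU. This assumes AMatrix_t is convertible
// to TMatrixT, as the implementation above requires; `nRows`/`nCols` are hypothetical.
//
//    TCpuMatrix<float>  cpuWeights(nRows, nCols);
//    TCudaMatrix<float> gpuWeights(nRows, nCols);
//    TCuda<float>::CopyDiffArch(gpuWeights, cpuWeights); // goes through a TMatrixT temporary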

template <typename AFloat>
void TCuda<AFloat>::PrintTensor(const typename TCuda<AFloat>::Tensor_t & A, const std::string name, bool)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";
   std::cout << " strides = { ";
   auto strides = A.GetStrides();
   for (size_t k = 0; k < strides.size() - 1; ++k)
      std::cout << strides[k] << " , ";
   std::cout << strides.back() << " }\n ";

   if (A.GetShape().size() == 2) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetShape()[1]; ++j) {
            std::cout << A(i, j) << " ";
         }
         std::cout << " } " << std::endl;
      }
   } else if (A.GetShape().size() == 3) {
      for (size_t i = 0; i < A.GetFirstSize(); ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetHSize(); ++j) {
            std::cout << "{ ";
            for (size_t k = 0; k < A.GetWSize(); ++k) {
               std::cout << A(i, j, k) << " ";
            }
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   } else {
      for (size_t l = 0; l < A.GetSize(); ++l) {
         std::cout << A.GetData()[l] << " ";
      }
      std::cout << "\n";
   }
}

} // namespace DNN
} // namespace TMVA

#endif