TCudnn.h
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Joana Niermann 23/07/19
3 
4 /*************************************************************************
5  * Copyright (C) 2019, Joana Niermann *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////
13 // Definition of the TCudnn architecture class, which provides //
14 // a wrapping of the low-level functionality for neural networks //
15 // in the cuDNN library. //
16 ///////////////////////////////////////////////////////////////////
17 
18 #ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19 #define TMVA_DNN_ARCHITECTURES_CUDNN
20 
21 #include "RConfigure.h" // for definition of R__HAS_CUDNN
22 
23 #ifndef R__HAS_CUDNN
24 #error This file can be compiled only when cudnn is available in ROOT
25 #else
26 
27 #include "TMVA/DNN/Functions.h"
29 //#include "TMVA/DNN/CNN/Descriptors.h"
31 #include "TMVA/DNN/CNN/ConvLayer.h"
33 
34 #include "cudnn.h"
35 #include "Cuda/CudaBuffers.h"
36 #include "Cuda/CudaTensor.h"
38 #include <utility>
39 #include <vector>
40 
42 
43 class TRandom;
44 
45 namespace TMVA
46 {
47 namespace DNN
48 {
49 
50 struct TCudnnEmptyDescriptor {};
51 
52 
53 /** The TCudnn architecture class.
54  *
55  * Low-level interface class for CUDA computing architectures using the cuDNN
56  * library as backend. Contains as public types the declaration of the scalar,
57  * matrix and buffer types for this architecture, as well as the remaining
58  * functions in the low-level interface in the form of static members.
59  */
60 template<typename AFloat = Float_t>
61 class TCudnn
62 {
63 private:
64  static TRandom * fgRandomGen;
65 public:
66 
67  using Scalar_t = AFloat;
68  using Matrix_t = TCudaTensor<AFloat>;
69  using Tensor_t = TCudaTensor<AFloat>;
70  using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
71  using HostBuffer_t = TCudaHostBuffer<AFloat>;
72 
73  // The descriptors for the (tensor) data are held by the data classes (CudaTensor)
74  using ActivationDescriptor_t = cudnnActivationDescriptor_t;
75  using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
76  using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
77  using FilterDescriptor_t = cudnnFilterDescriptor_t;
78  //using OpTensorDescriptor_t = cudnnOpTensorDescriptor_t;
79  using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
80  //using ReductionDescriptor_t = cudnnReduceTensorDescriptor_t;
81  using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
82  using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
83  using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
84  using AlgorithmDataType_t = cudnnDataType_t;
85  using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
86  using TensorDescriptor_t = cudnnTensorDescriptor_t;
87 
88  using EmptyDescriptor_t = TCudnnEmptyDescriptor; // Used if a descriptor is not needed in a class
89 
90  using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
91  using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
92  //using BNormWorkspace_t = CNN::TCNNWorkspace<BNormLayer_t>;*/
93  using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
94  using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
95  using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
96  using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
97  using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
98  using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
99 
100  // template <typename AFloat>
101  // using ConvDescriptors_t = CNN::TCNNDescriptors<CNN::TConvLayer<TCudnn<AFloat>>>;
102 
103  // convolution options
104  // default is -1 (left to cudnn)
105  struct CNNOptions {
106 
107  static int ConvFwdAlgorithm;
108  static int ConvBwdDataAlgorithm;
109  static int ConvBwdFilterAlgorithm;
110  // default is 0 (left to cudnn : a value -1 will indicate to not use any space)
111  static Long_t ConvMaxWorkspaceSize;
112  }; // struct CNNOptions
113 
114  static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::RowMajor; }
115 
116 
117  static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
118  return Tensor_t( {n,c,h,w}, GetTensorLayout(), 0, 0);
119  }
120 
121  static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
122  return Tensor_t( buffer, {n,c,h,w}, GetTensorLayout(), 0, 0);
123  }
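 /* Illustrative usage sketch (not part of the original documentation): assuming a
  * cuDNN-enabled build, a 4D tensor in NCHW (row-major) layout for a batch of
  * 32 single-channel 28x28 images could be created as
  *
  *   auto t = TCudnn<float>::CreateTensor(32, 1, 28, 28);
  *
  * or, reusing an already allocated device buffer `buf` (hypothetical DeviceBuffer_t):
  *
  *   auto t2 = TCudnn<float>::CreateTensor(buf, 32, 1, 28, 28);
  */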
124 
125  // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
126  // this function is used by the optimizers to store intermediate weight representations
127  static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
128  if (!newWeights.empty()) newWeights.clear();
129  size_t n = weights.size();
130  for (size_t i = 0; i < n; ++i)
131  newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
132  }
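 /* Illustrative sketch (an assumption, not from the original docs): an optimizer can use
  * CreateWeightTensors to allocate per-weight state with matching shapes, where
  * `layerWeights` stands for any existing std::vector<Matrix_t> of weights:
  *
  *   std::vector<TCudnn<float>::Matrix_t> firstMoments;
  *   TCudnn<float>::CreateWeightTensors(firstMoments, layerWeights);
  *   // firstMoments[i] now has the same shape and layout as layerWeights[i]
  */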
133  //____________________________________________________________________________
134  //
135  // Architecture Initialization
136  //____________________________________________________________________________
137 
138  static void InitializeBNormDescriptors(TDescriptors * & descriptors,
139  BNormLayer_t *L = nullptr);
140 
141  static void InitializeConvDescriptors(TDescriptors * & descriptors,
142  ConvLayer_t *L = nullptr);
143 
144  static void InitializePoolDescriptors(TDescriptors * & descriptors,
145  PoolingLayer_t *L = nullptr);
146 
147  static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors, EActivationFunction activFunc, double coef = 0.0);
148 
149  static void ReleaseConvDescriptors(TDescriptors * descriptors );
150  static void ReleasePoolDescriptors(TDescriptors * descriptors );
151  static void ReleaseBNormDescriptors(TDescriptors * descriptors );
152  static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {} // Does nothing
153  static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
154  static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
155  static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
156  static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
157  static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
158  static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
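 /* Illustrative sketch of the descriptor lifecycle (assuming only the declarations
  * above; not part of the original documentation):
  *
  *   TCudnn<float>::ActivationDescriptor_t actDescr;
  *   TCudnn<float>::InitializeActivationDescriptor(actDescr, EActivationFunction::kRelu);
  *   // ... use actDescr in ActivationFunctionForward / ActivationFunctionBackward ...
  *   TCudnn<float>::ReleaseDescriptor(actDescr);
  */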
159 
160 
161  static void InitializeConvWorkspace(TWorkspace * & workspace,
162  TDescriptors * & descriptors,
163  const DNN::CNN::TConvParams & params,
164  ConvLayer_t *L = nullptr);
165  static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
166  TDescriptors * & descriptors,
167  const DNN::CNN::TConvParams & params,
168  PoolingLayer_t *L = nullptr);
169 
170  static void FreeConvWorkspace(TWorkspace * workspace, ConvLayer_t *L = nullptr);
171  static void FreePoolDropoutWorkspace(TWorkspace * workspace, PoolingLayer_t *L = nullptr);
172  //____________________________________________________________________________
173  //
174  // Propagation
175  //____________________________________________________________________________
176 
177  /** @name Forward Propagation
178  * Low-level functions required for the forward propagation of activations
179  * through the network.
180  */
181  ///@{
182  /** Matrix-multiply \p input with the transpose of \p weights and
183  * write the results into \p output. */
184  static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
185 
186  /** Add the vectors biases row-wise to the matrix output */
187  static void AddRowWise(Tensor_t &output,const Matrix_t &biases);
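 /* Illustrative sketch (an assumption based on the declarations above): a dense-layer
  * forward pass combines the two calls,
  *
  *   TCudnn<float>::MultiplyTranspose(output, input, weights); // output = input * weights^T
  *   TCudnn<float>::AddRowWise(output, biases);                // add the bias vector to every row
  */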
188 
189  /** @name Backward Propagation (Dense Layers)
190  * Low-level functions required for the backward propagation of gradients
191  * through the network.
192  */
193  ///@{
194  /** Perform the complete backward propagation step. If the provided
195  * \p activationGradientsBackward matrix is not empty, compute the
196  * gradients of the objective function with respect to the activations
197  * of the previous layer (backward direction).
198  * Also compute the weight and the bias gradients. Modifies the values
199  * in \p df and thus produces a valid result only if it is applied the
200  * first time after the corresponding forward propagation has been
201  * performed. */
202  static void Backward(Tensor_t & activationGradientsBackward,
203  Matrix_t & weightGradients,
204  Matrix_t & biasGradients,
205  Tensor_t & df,
206  const Tensor_t & activationGradients,
207  const Matrix_t & weights,
208  const Tensor_t & activationBackward);
209 
210  /** Above functions extended to tensors */
211  static void ScaleAdd(Tensor_t & A, const Tensor_t & B,
212  Scalar_t alpha = 1.0,
213  Scalar_t beta = 1.0);
214 
215  /** Deep copy from B to A. */
216  static void Copy(Tensor_t & A, const Tensor_t & B);
217 
218  // copy from another tensor
219  template<typename ATensor_t>
220  static void CopyDiffArch(Tensor_t & A,
221  const ATensor_t & B);
222 
223  template <typename ATensor_t>
224  static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);
225 
226  //template<>
227  static void CopyDiffArch(Tensor_t A, const Tensor_t & B ) { Copy(A,B); }
228 
229  // copy from vector of matrices of different types
230  template<typename AMatrix_t>
231  static void CopyDiffArch(std::vector<Tensor_t> & A,
232  const std::vector<AMatrix_t> & B);
233 
234 
235  //____________________________________________________________________________
236  //
237  // Activation Functions
238  //____________________________________________________________________________
239 
240  /** @name Activation Functions
241  * For each activation function, the low-level interface contains two routines.
242  * One that applies the activation function to a matrix and one that evaluates
243  * the derivatives of the activation function at the elements of a given matrix
244  * and writes the results into the result matrix.
245  */
246  ///@{
247  static void Identity(Tensor_t & X) {}
248  static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
249  Tensor_t & Y, Tensor_t & dY,
250  ActivationDescriptor_t activationDescr,
251  const AFloat alpha = 1,
252  const AFloat beta = 1) {}
253 
254  static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
255  const ActivationDescriptor_t activationDescr,
256  const double coef = 0.0, const AFloat alpha = 1,
257  const AFloat beta = 0);
258 
259  // same as above but using different input/output tensors
260  static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t & X, EActivationFunction activFunct,
261  const ActivationDescriptor_t activationDescr, const double coef = 0.0,
262  const AFloat alpha = 1, const AFloat beta = 0);
263 
264  /** Computes the gradient of the activation function */
265  static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
266  const Tensor_t & dY, const Tensor_t & X,
267  EActivationFunction activFunct,
268  const ActivationDescriptor_t activationDescr,
269  const AFloat alpha = 1,
270  const AFloat beta = 0);
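 /* Illustrative sketch (assuming a descriptor initialized as in the descriptor section
  * above): applying an activation in place during the forward pass and propagating its
  * gradient back,
  *
  *   TCudnn<float>::ActivationFunctionForward(X, EActivationFunction::kTanh, actDescr);
  *   // backward: compute dX from the forward input X, forward output Y and incoming gradient dY
  *   TCudnn<float>::ActivationFunctionBackward(dX, Y, dY, X, EActivationFunction::kTanh, actDescr);
  */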
271 
272  //
273  // No cudnn implementation for the following activation functions
274  //
275  //static void SymmetricRelu(Tensor_t & B);
276  static void SymmetricReluDerivative(Tensor_t & B,
277  const Tensor_t & A) {}
278 
279  //static void SoftSign(Tensor_t & B);
280  static void SoftSignDerivative(Tensor_t & B,
281  const Tensor_t & A) {}
282 
283  //static void Gauss(Tensor_t & B);
284  static void GaussDerivative(Tensor_t & B,
285  const Tensor_t & A) {}
286  ///@}
287 
288  //____________________________________________________________________________
289  //
290  // Loss Functions
291  //____________________________________________________________________________
292 
293  /** @name Loss Functions
294  * Loss functions compute a scalar value given the \p output of the network
295  * for a given training input and the expected network prediction \p Y that
296  * quantifies the quality of the prediction. For each function a routine
297  * that computes the gradients (suffixed by Gradients) must also be provided
298  * to start the backpropagation algorithm.
299  */
300  ///@{
301 
302  static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
303  const Matrix_t &weights);
304  static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
305  const Matrix_t &output, const Matrix_t &weights);
306 
307  /** Sigmoid transformation is implicitly applied, thus \p output should
308  * hold the linear activations of the last layer in the net. */
309  static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
310  const Matrix_t &weights);
311 
312  static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
313  const Matrix_t &output, const Matrix_t &weights);
314 
315  /** Softmax transformation is implicitly applied, thus \p output should
316  * hold the linear activations of the last layer in the net. */
317  static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
318  const Matrix_t &weights);
319  static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
320  const Matrix_t &output, const Matrix_t &weights);
321  ///@}
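 /* Illustrative sketch (not from the original docs): evaluating a loss and seeding
  * backpropagation, where `output` holds the linear activations of the last layer,
  *
  *   auto loss = TCudnn<float>::CrossEntropy(Y, output, weights);   // scalar loss value
  *   TCudnn<float>::CrossEntropyGradients(dY, Y, output, weights);  // gradient w.r.t. output
  */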
322 
323  //____________________________________________________________________________
324  //
325  // Output Functions
326  //____________________________________________________________________________
327 
328  /** @name Output Functions
329  * Output functions transform the activations \p output of the
330  * output layer in the network to a valid prediction \p YHat for
331  * the desired usage of the network, e.g. the identity function
332  * for regression or the sigmoid transformation for two-class
333  * classification.
334  */
335  ///@{
336  static void Sigmoid(Matrix_t &YHat,
337  const Matrix_t & );
338  static void Softmax(Matrix_t &YHat,
339  const Matrix_t & );
340  ///@}
341 
342 
343 
344  //____________________________________________________________________________
345  //
346  // Dropout
347  //____________________________________________________________________________
348 
349  /** @name Dropout
350  */
351  ///@{
352 
353  /** Apply dropout with activation probability \p p to the given
354  * tensor \p A and scale the result by the reciprocal of \p p. */
355  static void DropoutForward(Tensor_t & A,
356  TDescriptors * descriptors,
357  TWorkspace * workspace,
358  Scalar_t p);
359 
360  static void DropoutBackward(Tensor_t & A,
361  TDescriptors * descriptors,
362  TWorkspace * workspace);
363 
364  ///@}
365 
366  //____________________________________________________________________________
367  //
368  // Batch Normalization
369  //____________________________________________________________________________
370 
371  /** @name Batch Normalization Layer Propagation
372  */
373  ///@{
374 
375  /** The inputs from each batch are normalized during training to have zero mean and unit variance
376  * and are then rescaled by two parameters, different for each input variable:
377  * - a scale factor gamma
378  * - an offset beta */
379 
380  static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
381  Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
382  Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
383  Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
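 /* For reference, the transformation applied per input variable is the standard
  * batch-normalization formula (a clarifying note, not taken from the original docs):
  *
  *   y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
  *
  * with the running mean and variance updated using \p momentum during training.
  */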
384 
385  /** During inference the inputs are not normalized using the batch mean, but using the
386  * previously computed running mean and variance */
387 
388  static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
389  Tensor_t &y, const Matrix_t &runningMeans,
390  const Matrix_t &runningVars, Scalar_t epsilon,
391  const TensorDescriptor_t &);
392 
393  static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
394  Matrix_t &gamma, // Matrix_t &beta, (not needed)
395  Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
396  const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
397 
398  //____________________________________________________________________________
399  //
400  // Regularization
401  //____________________________________________________________________________
402 
403  /** @name Regularization
404  * For each regularization type two functions are required, one named
405  * <tt><Type>Regularization</tt> that evaluates the corresponding
406  * regularization functional for a given weight matrix and the
407  * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
408  * component in the gradients to the provided matrix.
409  */
410 
411  static Scalar_t L1Regularization(const Matrix_t &W)
412  {
413  TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
414  return TCuda<AFloat>::L1Regularization(mW);
415  }
416  static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
417  {
418  TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
419  TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
420  return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
421  }
422 
423  static Scalar_t L2Regularization(const Matrix_t &W)
424  {
425  TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
426  return TCuda<AFloat>::L2Regularization(mW);
427  }
428  static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
429  {
430  TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
431  TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
432  return TCuda<AFloat>::AddL2RegularizationGradients(mA, mW, weightDecay);
433  }
434  ///@}
435 
436  //____________________________________________________________________________
437  //
438  // Initialization
439  //____________________________________________________________________________
440 
441  /** @name Initialization
442  * For each initialization method, one function in the low-level interface
443  * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
444  * initialization method Type.
445  */
446  ///@{
447 
448  static void InitializeGauss(Matrix_t &A);
449  static void InitializeUniform(Matrix_t &A);
450  static void InitializeIdentity(Matrix_t &A);
451  static void InitializeZero(Matrix_t &A);
452  static void InitializeGlorotNormal(Matrix_t &A);
453  static void InitializeGlorotUniform(Matrix_t &A);
454 
455  // return static instance of random generator used for initialization
456  // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
457  static TRandom &GetRandomGenerator();
458  // set the random seed for the static generator
459  // if the static generator does not exist it is created
460  static void SetRandomSeed(size_t seed);
461  ///@}
462 
463  //____________________________________________________________________________
464  //
465  // Dropout
466  //____________________________________________________________________________
467 
468  /** @name Dropout
469  */
470  ///@{
471 
472  /** Apply dropout with activation probability \p p to the given
473  * tensor \p A and scale the result by the reciprocal of \p p. */
474  static void Dropout(Tensor_t &A, Scalar_t p) {}
475 
476  ///@}
477 
478  //____________________________________________________________________________
479  //
480  // Convolutional Layer Propagation
481  //____________________________________________________________________________
482 
483  /** @name Forward Propagation in Convolutional Layer
484  */
485  ///@{
486 
487  /** Add the biases in the Convolutional Layer. */
488  static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
489  ///@}
490 
491  /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
492  static void PrepareInternals(Tensor_t &) {}
493 
494  /** Forward propagation in the Convolutional layer */
495  static void ConvLayerForward(Tensor_t &output,
496  Tensor_t &inputActivationFunc, // this is output conv w/o activ func.
497  const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases,
498  const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
499  Tensor_t & /* inputPrime */, const ConvDescriptors_t &descriptors,
500  ConvWorkspace_t &workspace);
501  // const AFloat alpha = 1,
502  // const AFloat beta = 1);
503 
504  /** @name Backward Propagation in Convolutional Layer
505  */
506  ///@{
507 
508  /** Perform the complete backward propagation step in a Convolutional Layer.
509  * If the provided \p activationGradientsBackward matrix is not empty, compute the
510  * gradients of the objective function with respect to the activations
511  * of the previous layer (backward direction).
512  * Also compute the weight and the bias gradients. Modifies the values
513  * in \p df and thus produces a valid result only if it is applied the
514  * first time after the corresponding forward propagation has been
515  * performed. */
516  static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
517  Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
518  const Matrix_t &weights, const Tensor_t &activationBackward,
519  const Tensor_t &outputTensor, EActivationFunction activFunc,
520  const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace, size_t /*batchSize*/,
521  size_t /*inputHeight*/, size_t /*inputWidth*/, size_t /*depth*/, size_t /*height*/,
522  size_t /*width*/, size_t /*filterDepth*/, size_t /*filterHeight*/,
523  size_t /*filterWidth*/, size_t /*nLocalViews*/);
524 
525  ///@}
526 
527  //____________________________________________________________________________
528  //
529  // Max Pooling Layer Propagation
530  //____________________________________________________________________________
531  /** @name Forward Propagation in Max Pooling Layer
532  */
533  ///@{
534 
535  /** Downsample the matrix \p C to the matrix \p A, using the max
536  * operation, such that the winning indices are stored in matrix
537  * \p B. No winning indices are needed for cuDNN. */
538  static void Downsample(Tensor_t &A, Tensor_t & /*B*/, const Tensor_t &C, const PoolingDescriptors_t &descriptors,
539  PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
540  size_t fltWidth, size_t strideRows, size_t strideCols);
541 
542  ///@}
543 
544  /** @name Backward Propagation in Max Pooling Layer
545  */
546  ///@{
547  /** Perform the complete backward propagation step in a Pooling Layer. Based on the
548  * input to and output from the MaxPoolLayer, the gradients for the winning pixels
549  * are computed. */
550  static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
551  const Tensor_t & /*indexMatrix*/, const Tensor_t &inputActivation,
552  const Tensor_t &outputTensor, const PoolingDescriptors_t &descriptors,
553  PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
554  size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews);
555 
556  ///@}
557 
558  //____________________________________________________________________________
559  //
560  // Reshape Layer Propagation
561  //____________________________________________________________________________
562  /** @name Forward and Backward Propagation in Reshape Layer
563  */
564  ///@{
565 
566  /** Transform the matrix \p B to a matrix with different dimensions \p A */
567  // static void Reshape(Matrix_t &A, const Matrix_t &B);
568 
569  /** Flattens the tensor \p B, such that each matrix is stretched into
570  * one row, resulting in the matrix \p A. */
571  static void Flatten(Tensor_t &A, const Tensor_t &B);
572 
573  /** Transforms each row of \p B into a matrix and stores it in the
574  * tensor \p A. */
575  static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
576 
577  /** Rearrange data according to time: fill the B x T x D output with the T x B x D input matrix. */
578  static void Rearrange(Tensor_t &out, const Tensor_t &in) { TCuda<AFloat>::Rearrange(out, in); }
579 
580  /** Backward pass for Recurrent Networks */
581  static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
582  Matrix_t & /* input_weight_gradients */, Matrix_t &/* state_weight_gradients */,
583  Matrix_t &/* bias_gradients */,
584  Matrix_t &/* df */, // DxH
585  const Matrix_t &/* state */, // BxH
586  const Matrix_t &/* weights_input */, // HxD
587  const Matrix_t &/* weights_state */, // HxH
588  const Matrix_t &/* input */, // BxD
589  Matrix_t &/* input_gradient */)
590  {
591  return state_gradients_backward;
592  }
593 
594  ///@}
595 
596  //____________________________________________________________________________
597  //
598  // Additional Arithmetic Functions
599  //____________________________________________________________________________
600 
601  /** @name Additional Arithmetic Functions
602  *
603  * Additional arithmetic on CUDA matrices used to implement the low-level
604  * interface.
605  */
606 
607  /** In-place Hadamard (element-wise) product of matrices \p A and \p B
608  * with the result being written into \p A.
609  */
610  static void Hadamard(Tensor_t &A, const Tensor_t &B)
611  {
612  TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
613  TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
614  assert(A.GetSize() == B.GetSize());
615  TCuda<AFloat>::Hadamard(tmpA, tmpB);
616  }
617  // static void Hadamard(Matrix_t &A,
618  // const Matrix_t &B);*/
619  // {
620  // Tensor_t tA(A);
621  // Hadamard( tA, Tensor_t(B));
622  // }
623 
624 
625  /** Compute the sum of all elements in \p A */
626  static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
627 
628  /** Check two matrices for equality, taking floating point arithmetic errors into account. */
629  //static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
630 
631  /** Add the constant \p beta to all the elements of matrix \p A and write the
632  * result into \p A.
633  */
634  static void ConstAdd(Matrix_t &A, Scalar_t beta) {
635  TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
636  TCuda<AFloat>::ConstAdd(tmp,beta);
637  }
638 
639  /** Multiply all the elements of matrix \p A by the constant \p beta and write the
640  * result into \p A.
641  */
642  static void ConstMult(Matrix_t &A, Scalar_t beta) {
643  TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
644  TCuda<AFloat>::ConstMult(tmp,beta);
645  }
646 
647  /** Take the reciprocal of each element of the matrix \p A and write the result into
648  * \p A
649  */
650  static void ReciprocalElementWise(Matrix_t &A) {
651  TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
652  TCuda<AFloat>::ReciprocalElementWise(tmp);
653  }
654 
655  /** Square each element of the matrix \p A and write the result into
656  * \p A
657  */
658  static void SquareElementWise(Matrix_t &A) {
659  TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
660  TCuda<AFloat>::SquareElementWise(tmp);
661  }
662 
663  /** Take the square root of each element of the matrix \p A and write the result into
664  * \p A
665  */
666  //static void SqrtElementWise(Matrix_t &A, Scalar_t alpha = 1, Scalar_t beta = 0, Scalar_t gamma = 0) {
667  static void SqrtElementWise(Matrix_t &A) {
668  TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
669  TCuda<AFloat>::SqrtElementWise(tmp);
670  }
671 
672  // optimizer functions
673  static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
674  TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
675  TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
676  TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
677  TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV,alpha, eps);
678  }
679  static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
680  TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
681  TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
682  TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
683  }
684  static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
685  TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
686  TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
687  TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
688  }
689 
690  // printing of tensor
691  static void PrintTensor( const Tensor_t & A, const std::string name = "tensor", bool = false);
692 
693 
694 
695  ///////////////////////////////////////////////////////////////////////////////
696  /// extra functions defined only for CPU architecture !!!
697  //////////////////////////////////////////////////////////////////////////////
698 
699  /** Sum rows of (m x n) matrix \p A and write the results into the first
700  * m elements in \p B.
701  */
702  static void SumRows(Matrix_t & B, const Matrix_t & A);
703 
704 
705 };
706 
707 
708 //____________________________________________________________________________
709 template <typename AFloat>
710 template <typename ATensor>
711 void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B,
712  const ATensor &A)
713 {
714 
715  // should add a static assert that A is not the same type as B
716 
717  // this copies tensors from different architectures
718  if (B.GetLayout() == GetTensorLayout() ) {
719  assert(B.GetShape().size() == 4);
720  for (size_t i = 0; i < A.GetFirstSize(); ++i) {
721  TMatrixT<AFloat> matIn = A.At(i).GetMatrix(); // this convert tensor (B,D,HW) in (D,HW)i -> (D,HW)i
722  // TMatrixT has the correct layout (row-major), no need to transpose in this case
723  TCudaTensor<AFloat> tmpOut = B.At(i); // matrix (D,HW)
724  // copy will copy the buffer
725  TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
726  Copy(tmpOut, tmpIn);
727  }
728  } else {
729  // case of column-major layout
730  TMatrixT<AFloat> tmp = A;
731  TCudaMatrix<AFloat> tmp2(tmp);
732  TCudaTensor<AFloat> tA(tmp2);
733  Copy(B, tA);
734  }
735 }
736 
737 //____________________________________________________________________________
738 template <typename AFloat>
739 template <typename AMatrix>
740 void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
741 {
742  // copy from another architecture using the reference one
743  // this is not very efficient since it creates temporary objects
744  TMatrixT<AFloat> tmp = A; // .GetMatrix();
745  // we need to transpose for a different layout
746  if (B.GetLayout() == GetTensorLayout() ) {
747  // this is for CNN weights that are in row-major formats
748  assert(B.GetShape().size() == 4); // weights shape should be 4
749  tmp.T();
750  }
751  TCudaMatrix<AFloat> tmp2(tmp);
752  TCudaTensor<AFloat> tA(tmp2);
753  Copy(B, tA);
754 }
755 
756 //____________________________________________________________________________
757 template <typename AFloat>
758 template <typename AMatrix_t>
759 void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B,
760  const std::vector<AMatrix_t> &A)
761 {
762  for (size_t i = 0; i < B.size(); ++i) {
763  CopyWeightsDiffArch(B[i], A[i]);
764  }
765 }
766 
767 template <typename AFloat>
768 void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t & A, const std::string name, bool truncate )
769 {
770  std::cout << name << " size = " << A.GetSize() << " shape = { ";
771  auto shape = A.GetShape();
772  for (size_t k = 0; k < shape.size()-1; ++k)
773  std::cout << shape[k] << " , ";
774  std::cout << shape.back() << " } ";
775  std::cout << " strides = { ";
776  auto strides = A.GetStrides();
777  for (size_t k = 0; k < strides.size()-1; ++k)
778  std::cout << strides[k] << " , ";
779  std::cout << strides.back() << " }\n ";
780 
781  if (A.GetShape().size() == 2 ) {
782  for (size_t i = 0; i < A.GetShape()[0]; ++i) {
783  std::cout << "{ ";
784  size_t n = A.GetShape()[1];
785  if (truncate) n = std::min(n,size_t(10));
786  for (size_t j = 0; j < n; ++j) {
787  std::cout << A(i,j) << " ";
788 
789  }
790  if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
791  std::cout << " } " << std::endl;
792  }
793  } else if (A.GetShape().size() == 3 ) {
794  for (size_t i = 0; i < A.GetFirstSize(); ++i) {
795  std::cout << "{ ";
796  for (size_t j = 0; j < A.GetHSize(); ++j) {
797  std::cout << "{ ";
798  size_t n = A.GetWSize();
799  if (truncate) n = std::min(n,size_t(10));
800  for (size_t k = 0; k < n; ++k) {
801  std::cout << A(i,j,k) << " ";
802  }
803  if (truncate && n < A.GetWSize()) std::cout << " ...... ";
804  std::cout << " } " << std::endl;
805  }
806  std::cout << " } " << std::endl;
807  }
808  } else if (A.GetShape().size() == 4 ) {
809  for (size_t i = 0; i < A.GetShape()[0]; ++i) {
810  std::cout << "{ ";
811  for (size_t j = 0; j < A.GetShape()[1]; ++j) {
812  std::cout << "{ ";
813  for (size_t k = 0; k < A.GetShape()[2]; ++k) {
814  size_t n = A.GetShape()[3];
815  if (truncate) n = std::min(n,size_t(10));
816  for (size_t l = 0; l < n; ++l) {
817  std::cout << A(i,j,k,l) << " ";
818  }
819  if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
820  std::cout << " } " << std::endl;
821  }
822  std::cout << " } " << std::endl;
823  }
824  std::cout << " } " << std::endl;
825  }
826  }
827  else {
828  for (size_t l = 0; l < A.GetSize(); ++l) {
829  std::cout << A.GetData()[l] << " ";
830  }
831  std::cout << "\n";
832  }
833 }
834 
835 // initialize the CNN options
836 // possible options for forward (from 0 to 7)
837 //
838 // 0 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
839 // 1 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
840 // 6 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
841 // 7 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; (lots of memory)
842 
843 // for backward data (from 0 to 5)
844 // 1 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
845 // 5 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
846 
847 template <typename AFloat>
848 int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
849 template <typename AFloat>
850 int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
851 template <typename AFloat>
852 int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
853 template <typename AFloat>
854 Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1; // -1 lets cuDNN use its defaults
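/* Illustrative sketch (an assumption, not from the original docs): the convolution
 * algorithms and the workspace limit can be overridden before a network is built, e.g.
 *
 *   TCudnn<float>::CNNOptions::ConvFwdAlgorithm     = 1;                // e.g. IMPLICIT_PRECOMP_GEMM (see list above)
 *   TCudnn<float>::CNNOptions::ConvBwdDataAlgorithm = 1;                // e.g. BWD_DATA_ALGO_1
 *   TCudnn<float>::CNNOptions::ConvMaxWorkspaceSize = 64 * 1024 * 1024; // cap the workspace at 64 MB (illustrative value)
 */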
855 
856 } // namespace DNN
857 } // namespace TMVA
858 
859 #endif
860 #endif