18 #ifndef TMVA_DNN_ARCHITECTURES_CUDA
19 #define TMVA_DNN_ARCHITECTURES_CUDA
// Empty tag types standing in for the cuDNN descriptor / algorithm handles.
// The plain-CUDA backend keeps no cuDNN state, so these carry no data; they
// exist only so that code written against the generic architecture interface
// (which also targets the cuDNN backend) compiles unchanged.
42 struct CudaActivationDescriptor {};
43 struct CudaFilterDescriptor {};
44 struct CudaConvolutionDescriptor {};
45 struct CudaDropoutDescriptor {};
46 struct CudaPoolingDescriptor {};
47 struct CudaConvolutionFwdAlgo {};
48 struct CudaConvolutionBwdDataAlgo {};
49 struct CudaConvolutionBwdFilterAlgo {};
50 struct CudaDataType {};
53 struct CudaEmptyDescriptor {};
// TCuda: low-level GPU (plain CUDA, non-cuDNN) backend of the TMVA DNN
// architecture interface. Scalar precision is selected via AReal.
// NOTE(review): the class-head line itself is not visible here, and the
// alias `AFloat` used below is presumably `using AFloat = AReal;` — confirm.
62 template<
typename AReal = Float_t>
// Random generator shared by the weight-initialization routines (see
// GetRandomGenerator / SetRandomSeed below).
66 static TRandom * fgRandomGen;
// Basic scalar / matrix / tensor / buffer vocabulary types of this backend.
70 using Scalar_t = AFloat;
72 using Matrix_t = TCudaMatrix<AFloat>;
73 using Tensor_t = TCudaTensor<AFloat>;
74 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
75 using HostBuffer_t = TCudaHostBuffer<AFloat>;
// Descriptor aliases: all map to the empty tag structs above, since the
// plain CUDA implementation needs no cuDNN descriptors.
77 using ActivationDescriptor_t = CudaActivationDescriptor;
78 using ConvolutionDescriptor_t = CudaConvolutionDescriptor;
79 using FilterDescriptor_t = CudaFilterDescriptor;
80 using DropoutDescriptor_t = CudaDropoutDescriptor;
82 using PoolingDescriptor_t = CudaPoolingDescriptor;
83 using TensorDescriptor_t = DummyType;
// Convolution algorithm selectors — likewise empty placeholders.
85 using AlgorithmForward_t = CudaConvolutionFwdAlgo;
86 using AlgorithmBackward_t = CudaConvolutionBwdDataAlgo;
87 using AlgorithmHelper_t = CudaConvolutionBwdFilterAlgo;
88 using AlgorithmDataType_t = CudaDataType;
89 using ReduceTensorDescriptor_t = DummyType;
91 using EmptyDescriptor_t = CudaEmptyDescriptor;
// Layer / descriptor / workspace aliases used by the generic layer code.
93 using BNormLayer_t = TBatchNormLayer<TCuda<AReal>>;
94 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
96 using ConvLayer_t = CNN::TConvLayer<TCuda<AReal>>;
97 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
98 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
99 using PoolingLayer_t = CNN::TMaxPoolLayer<TCuda<AReal>>;
100 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
101 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
103 static TMVA::Experimental::MemoryLayout GetTensorLayout() {
return TMVA::Experimental::MemoryLayout::ColumnMajor; }
105 static Tensor_t CreateTensor(
size_t n,
size_t c,
size_t h,
size_t w) {
106 return Tensor_t( {c,h*w,n}, GetTensorLayout());
108 static Tensor_t CreateTensor(DeviceBuffer_t buffer,
size_t n,
size_t c,
size_t h,
size_t w) {
109 return Tensor_t( buffer, {c,h*w, n}, GetTensorLayout(), 0, 0);
118 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights,
const std::vector<Matrix_t> & weights) {
119 if (!newWeights.empty()) newWeights.clear();
120 size_t n = weights.size();
121 for (
size_t i = 0; i < n; ++i)
122 newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
132 static void InitializeBNormDescriptors(TDescriptors * & ,
134 Error(
"InitializeBNormDescriptrs",
"Batch normalization on GPU is supported only with Cudnn");
// Descriptor and workspace management: all no-ops in the plain CUDA backend,
// since only the cuDNN backend carries per-layer GPU state.
137 static void InitializeConvDescriptors(TDescriptors *& , ConvLayer_t * ) {}
139 static void InitializePoolDescriptors(TDescriptors *& , PoolingLayer_t * ) {}
141 static void InitializeActivationDescriptor(ActivationDescriptor_t &, EActivationFunction ,
double = 0.0) {}
144 static void ReleaseConvDescriptors(TDescriptors * & ) {}
145 static void ReleasePoolDescriptors(TDescriptors * & ) {}
146 static void ReleaseBNormDescriptors(TDescriptors *& ) {}
// NOTE(review): the two workspace initializers below appear truncated
// (trailing commas, no body visible) — confirm their full signatures against
// the cuDNN backend's counterparts before editing.
148 static void InitializeConvWorkspace(TWorkspace * & ,
150 const DNN::CNN::TConvParams & ,
152 static void InitializePoolDropoutWorkspace(TWorkspace * & ,
154 const DNN::CNN::TConvParams & ,
157 static void ReleaseDescriptor(ActivationDescriptor_t & ) {}
159 static void FreeConvWorkspace(TWorkspace * & , ConvLayer_t *) {}
160 static void FreePoolDropoutWorkspace(TWorkspace * & , PoolingLayer_t *) {}
// Matrix multiplication with transposed weights: output = input * weights^T.
// Implementation (cuBLAS-backed) lives in the corresponding .cu source file.
175 static void MultiplyTranspose(Matrix_t &output,
const Matrix_t &input,
const Matrix_t &weights);
177 static void MultiplyTranspose(Tensor_t &output,
const Tensor_t &input,
const Matrix_t &weights) {
178 Matrix_t output_matrix = output.GetMatrix();
179 MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
// Add the bias row vector to every row of output (in place).
184 static void AddRowWise(Matrix_t &output,
const Matrix_t &biases);
186 static void AddRowWise(Tensor_t &output,
const Matrix_t &biases) {
187 Matrix_t output_matrix = output.GetMatrix();
188 AddRowWise(output_matrix, biases);
// Backward pass of a fully-connected layer: fills the gradients w.r.t. the
// previous layer's activations, the weights and the biases.
// NOTE(review): several declarations in this section appear truncated
// (trailing commas with missing parameter lines) — confirm full signatures
// before editing.
205 static void Backward(Tensor_t & activationGradientsBackward,
206 Matrix_t & weightGradients,
207 Matrix_t & biasGradients,
209 const Tensor_t & activationGradients,
210 const Matrix_t & weights,
211 const Tensor_t & activationBackward);
// BLAS-style axpy-like update and copies between matrices/tensors.
216 static void ScaleAdd(Matrix_t & A,
218 Scalar_t beta = 1.0);
220 static void Copy(Matrix_t & B,
// Cross-architecture copies (e.g. from a CPU-backend matrix/tensor).
224 template<
typename AMatrix_t>
225 static void CopyDiffArch(Matrix_t & B,
const AMatrix_t & A);
229 static void ScaleAdd(Tensor_t & A,
231 Scalar_t beta = 1.0);
233 static void Copy(Tensor_t & A,
237 template<
typename ATensor_t>
238 static void CopyDiffArch(Tensor_t & A,
239 const ATensor_t & B);
242 template<
typename AMatrix_t>
243 static void CopyDiffArch(std::vector<Matrix_t> & A,
244 const std::vector<AMatrix_t> & B);
// Generic activation dispatch (forward applies the function in place on X;
// backward computes dX from Y, dY and X). The descriptor is unused here.
266 static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
267 const ActivationDescriptor_t activationDescr,
268 const double coef = 0.0,
const AFloat alpha = 1,
269 const AFloat beta = 0);
272 static void ActivationFunctionBackward(Tensor_t & dX,
const Tensor_t & Y,
273 const Tensor_t & dY,
const Tensor_t & X,
274 EActivationFunction activFunct,
275 const ActivationDescriptor_t activationDescr,
276 const AFloat alpha = 1,
277 const AFloat beta = 0);
// Per-activation kernels and their derivatives (element-wise, in place).
279 static void IdentityDerivative(Tensor_t & B,
282 static void Relu(Tensor_t & B);
283 static void ReluDerivative(Tensor_t & B,
286 static void Sigmoid(Tensor_t & B);
287 static void SigmoidDerivative(Tensor_t & B,
290 static void Tanh(Tensor_t & B);
291 static void TanhDerivative(Tensor_t & B,
294 static void SymmetricRelu(Tensor_t & B);
295 static void SymmetricReluDerivative(Tensor_t & B,
298 static void SoftSign(Tensor_t & B);
299 static void SoftSignDerivative(Tensor_t & B,
302 static void Gauss(Tensor_t & B);
303 static void GaussDerivative(Tensor_t & B,
// Loss functions (weighted) and their gradients w.r.t. the prediction.
321 static Scalar_t MeanSquaredError(
const Matrix_t &Y,
const Matrix_t &output,
322 const Matrix_t &weights);
323 static void MeanSquaredErrorGradients(Matrix_t &dY,
const Matrix_t &Y,
324 const Matrix_t &output,
const Matrix_t &weights);
328 static Scalar_t CrossEntropy(
const Matrix_t &Y,
const Matrix_t &output,
329 const Matrix_t &weights);
331 static void CrossEntropyGradients(Matrix_t &dY,
const Matrix_t &Y,
332 const Matrix_t &output,
const Matrix_t &weights);
336 static Scalar_t SoftmaxCrossEntropy(
const Matrix_t &Y,
const Matrix_t &output,
337 const Matrix_t &weights);
338 static void SoftmaxCrossEntropyGradients(Matrix_t &dY,
const Matrix_t &Y,
339 const Matrix_t &output,
const Matrix_t &weights);
// Output-layer transfer functions.
355 static void Sigmoid(Matrix_t &YHat,
357 static void Softmax(Matrix_t &YHat,
// L1/L2 regularization terms and their gradient contributions.
375 static Scalar_t L1Regularization(
const Matrix_t & W);
376 static void AddL1RegularizationGradients(Matrix_t & A,
378 Scalar_t weightDecay);
380 static Scalar_t L2Regularization(
const Matrix_t & W);
381 static void AddL2RegularizationGradients(Matrix_t & A,
383 Scalar_t weightDecay);
// Weight/bias initialization schemes; all draw from the shared generator
// below (see fgRandomGen / SetRandomSeed).
398 static void InitializeGauss(Matrix_t & A);
399 static void InitializeUniform(Matrix_t & A);
400 static void InitializeIdentity(Matrix_t & A);
401 static void InitializeZero(Matrix_t & A);
402 static void InitializeGlorotNormal(Matrix_t & A);
403 static void InitializeGlorotUniform(Matrix_t & A);
407 static TRandom & GetRandomGenerator();
410 static void SetRandomSeed(
size_t seed);
// Dropout forward pass on a tensor; descriptor/workspace are ignored by
// this backend (they exist for cuDNN interface compatibility).
424 static void DropoutForward(Tensor_t & A,
425 TDescriptors * descriptors,
426 TWorkspace * workspace,
429 static void DropoutForward(Matrix_t & A, Scalar_t p) {
431 DropoutForward( tA, static_cast<TDescriptors *> (
nullptr), static_cast<TWorkspace *> (
nullptr), p );
// Dropout backward pass (no-op parameters unnamed for interface compatibility).
434 static void DropoutBackward(Tensor_t & ,
// Batch normalization: training forward (updates running statistics with the
// given momentum), inference forward (uses the running statistics), and the
// backward pass producing dx, dgamma and dbeta.
453 static void BatchNormLayerForwardTraining(
int axis,
const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
454 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
455 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
456 Scalar_t epsilon,
const TensorDescriptor_t &bnParDescriptor);
461 static void BatchNormLayerForwardInference(
int axis,
const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta, Tensor_t &y,
462 const Matrix_t &runningMeans,
const Matrix_t &runningVars,
463 Scalar_t epsilon,
const TensorDescriptor_t &);
465 static void BatchNormLayerBackward(
int axis,
const Tensor_t &x,
const Tensor_t &dy, Tensor_t &dx,
467 Matrix_t &dgamma, Matrix_t &dbeta,
const Matrix_t &mean,
const Matrix_t &variance,
468 const Matrix_t &iVariance, Scalar_t epsilon,
const TensorDescriptor_t &);
// Output spatial dimension of a convolution/pooling window:
// (imgDim - fltDim + 2*padding) / stride + 1 is the usual formula — the
// implementation lives in the source file.
480 static size_t calculateDimension(
size_t imgDim,
size_t fltDim,
size_t padding,
size_t stride);
// im2col machinery turning convolution into matrix multiplication.
// NOTE(review): the Im2col declaration appears truncated (parameters between
// A and the padding arguments are missing here).
484 static void Im2col(Matrix_t &A,
492 size_t zeroPaddingHeight,
493 size_t zeroPaddingWidth);
495 static void Im2colIndices(std::vector<int> &V,
const Matrix_t &B,
size_t nLocalViews,
size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
496 size_t fltWidth,
size_t strideRows,
size_t strideCols,
size_t zeroPaddingHeight,
497 size_t zeroPaddingWidth);
498 static void Im2colFast(Matrix_t &A,
const Matrix_t &B,
const std::vector<int> & V);
// Rotate convolution filters for the backward (gradient) pass.
502 static void RotateWeights(Matrix_t &A,
const Matrix_t &B,
size_t filterDepth,
size_t filterHeight,
503 size_t filterWidth,
size_t numFilters);
506 static void AddConvBiases(Matrix_t &output,
const Matrix_t &biases);
// No internal buffers to prepare in the plain CUDA backend.
510 static void PrepareInternals(Tensor_t &) {}
// Convolutional layer forward/backward passes and their gradient helpers.
513 static void ConvLayerForward(Tensor_t & output,
514 Tensor_t & inputActivationFunc,
515 const Tensor_t &input,
516 const Matrix_t &weights,
const Matrix_t & biases,
517 const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
519 const ConvDescriptors_t & ,
534 static void ConvLayerBackward(Tensor_t &activationGradientsBackward,
535 Matrix_t &weightGradients, Matrix_t &biasGradients,
537 Tensor_t &activationGradients,
538 const Matrix_t &weights,
539 const Tensor_t &activationBackward,
540 const Tensor_t & outputTensor,
541 EActivationFunction activFunc,
542 const ConvDescriptors_t & ,
544 size_t batchSize,
size_t inputHeight,
545 size_t inputWidth,
size_t depth,
546 size_t height,
size_t width,
547 size_t filterDepth,
size_t filterHeight,
548 size_t filterWidth,
size_t nLocalViews );
552 static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward,
554 const Matrix_t &weights,
size_t batchSize,
555 size_t inputHeight,
size_t inputWidth,
size_t depth,
size_t height,
556 size_t width,
size_t filterDepth,
size_t filterHeight,
561 static void CalculateConvWeightGradients(Matrix_t &weightGradients,
563 const Tensor_t &activations_backward,
564 size_t batchSize,
size_t inputHeight,
size_t inputWidth,
size_t depth,
565 size_t height,
size_t width,
size_t filterDepth,
size_t filterHeight,
566 size_t filterWidth,
size_t nLocalViews);
570 static void CalculateConvBiasGradients(Matrix_t &biasGradients,
const Tensor_t &df,
571 size_t batchSize,
size_t depth,
size_t nLocalViews);
// Max-pooling forward (Downsample, recording winner indices) and backward.
585 static void Downsample(Tensor_t &A, Tensor_t &B,
const Tensor_t &C,
586 const PoolingDescriptors_t & ,
587 PoolingWorkspace_t & ,
588 size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
589 size_t fltWidth,
size_t strideRows,
size_t strideCols);
599 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
600 const Tensor_t &activationGradients,
601 const Tensor_t &indexMatrix,
604 const PoolingDescriptors_t & ,
605 PoolingWorkspace_t & ,
// Layout transformations between layer representations.
625 static void Reshape(Matrix_t &A,
const Matrix_t &B);
629 static void Flatten(Tensor_t &A,
const Tensor_t &B);
633 static void Deflatten(Tensor_t &A,
const Tensor_t &B);
636 static void Rearrange(Tensor_t &out,
const Tensor_t &in);
// Vanilla-RNN backward step; returns the state gradient for the previous
// time step.
640 static Matrix_t & RecurrentLayerBackward(Matrix_t & state_gradients_backward,
641 Matrix_t & input_weight_gradients,
642 Matrix_t & state_weight_gradients,
643 Matrix_t & bias_gradients,
645 const Matrix_t & state,
646 const Matrix_t & weights_input,
647 const Matrix_t & weights_state,
648 const Matrix_t & input,
649 Matrix_t & input_gradient);
// Low-level matrix arithmetic (cuBLAS-backed) and element-wise helpers.
669 static void Multiply(Matrix_t &C,
675 static void TransposeMultiply(Matrix_t &output,
676 const Matrix_t &input,
677 const Matrix_t &Weights,
678 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
682 static void Hadamard(Tensor_t &A,
684 static void Hadamard(Matrix_t &A,
694 static void SumColumns(Matrix_t &B,
696 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
699 static Scalar_t Sum(
const Matrix_t &A);
// Element-wise approximate comparison with absolute tolerance epsilon.
702 static bool AlmostEquals(
const Matrix_t &A,
const Matrix_t &B,
double epsilon = 0.1);
707 static void ConstAdd(Matrix_t &A, Scalar_t beta);
712 static void ConstMult(Matrix_t &A, Scalar_t beta);
717 static void ReciprocalElementWise(Matrix_t &A);
722 static void SquareElementWise(Matrix_t &A);
727 static void SqrtElementWise(Matrix_t &A);
// Adam optimizer update steps (parameter, first moment, second moment).
730 static void AdamUpdate(Matrix_t & A,
const Matrix_t & M,
const Matrix_t & V, Scalar_t alpha, Scalar_t eps);
731 static void AdamUpdateFirstMom(Matrix_t & A,
const Matrix_t & B, Scalar_t beta);
732 static void AdamUpdateSecondMom(Matrix_t & A,
const Matrix_t & B, Scalar_t beta);
// Debug printout of a tensor (implemented out-of-line below).
735 static void PrintTensor(
const Tensor_t & A,
const std::string name =
"Cuda-tensor",
bool =
false);
// Sum each row of A into the column vector B.
744 static void SumRows(Matrix_t & B,
const Matrix_t & A);
750 template <
typename AFloat>
751 template <
typename AMatrix_t>
752 void TCuda<AFloat>::CopyDiffArch(TCudaMatrix<AFloat> &B,
757 TMatrixT<AFloat> tmp = A;
758 Copy(B, TCudaMatrix<AFloat>(tmp) );
762 template <
typename AFloat>
763 template <
typename AMatrix_t>
764 void TCuda<AFloat>::CopyDiffArch(std::vector<TCudaMatrix<AFloat>> &B,
765 const std::vector<AMatrix_t> &A)
767 for (
size_t i = 0; i < B.size(); ++i) {
768 CopyDiffArch(B[i], A[i]);
//____________________________________________________________________________
// Debug printout: writes the tensor's size, shape and strides to std::cout,
// then the element values — as a 2-D table for rank-2 tensors, per-slice
// tables for rank-3 tensors, and (for other ranks) a flat dump of the raw
// data buffer. NOTE(review): the definition is truncated at the end of this
// view (the flat-dump loop and closing braces continue past it).
772 template <
typename AFloat>
773 void TCuda<AFloat>::PrintTensor(
const typename TCuda<AFloat>::Tensor_t & A,
const std::string name,
bool )
// Header line: name, total element count and shape.
775 std::cout << name <<
" size = " << A.GetSize() <<
" shape = { ";
776 auto shape = A.GetShape();
777 for (
size_t k = 0; k < shape.size()-1; ++k)
778 std::cout << shape[k] <<
" , ";
779 std::cout << shape.back() <<
" } ";
// Strides follow the shape on the same header line.
780 std::cout <<
" strides = { ";
781 auto strides = A.GetStrides();
782 for (
size_t k = 0; k < strides.size()-1; ++k)
783 std::cout << strides[k] <<
" , ";
784 std::cout << strides.back() <<
" }\n ";
// Rank-2: print as a row-per-line table.
786 if (A.GetShape().size() == 2 ) {
787 for (
size_t i = 0; i < A.GetShape()[0]; ++i) {
789 for (
size_t j = 0; j < A.GetShape()[1]; ++j) {
790 std::cout << A(i,j) <<
" ";
792 std::cout <<
" } " << std::endl;
794 }
// Rank-3: one 2-D table per leading-axis slice.
else if (A.GetShape().size() == 3 ) {
795 for (
size_t i = 0; i < A.GetFirstSize(); ++i) {
797 for (
size_t j = 0; j < A.GetHSize(); ++j) {
799 for (
size_t k = 0; k < A.GetWSize(); ++k) {
800 std::cout << A(i,j,k) <<
" ";
802 std::cout <<
" } " << std::endl;
804 std::cout <<
" } " << std::endl;
// Fallback for other ranks: dump the underlying buffer linearly.
808 for (
size_t l = 0; l < A.GetSize(); ++l) {
809 std::cout << A.GetData()[l] <<
" ";