#ifndef TMVA_DNN_ARCHITECTURES_CPU
#define TMVA_DNN_ARCHITECTURES_CPU

#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/CNN/ContextHandles.h"

#include "Cpu/CpuBuffer.h"
#include "Cpu/CpuMatrix.h"
#include "Cpu/CpuTensor.h"

#include <vector>
#include <string>
#include <iostream>

class TRandom;

namespace TMVA {
namespace DNN {
// Dummy descriptor and algorithm types: the CPU backend needs no real
// cuDNN-style descriptors, but the generic layer code expects these names.
struct DummyDescriptor {};
struct DummyFilterDescriptor {};
struct DummyConvolutionDescriptor {};
struct DummyDropoutDescriptor {};
struct DummyPoolingDescriptor {};
struct DummyConvolutionFwdAlgo {};
struct DummyConvolutionBwdDataAlgo {};
struct DummyConvolutionBwdFilterAlgo {};
struct DummyDataType {};

struct DummyEmptyDescriptor {};
/** The TCpu architecture class.
 *
 * Low-level interface class for multi-threaded CPU architectures. Contains
 * as public types the declaration of the scalar, matrix and buffer types
 * used by the neural network implementation on this architecture.
 */
template <typename AReal = Float_t>
class TCpu {
private:
   static TRandom *fgRandomGen;

public:
   using Scalar_t = AReal;
   using Tensor_t = TCpuTensor<AReal>;
   using Matrix_t = TCpuMatrix<AReal>;
   using HostBuffer_t = TCpuBuffer<AReal>;
   using DeviceBuffer_t = TCpuBuffer<AReal>;
   using ActivationDescriptor_t = DummyDescriptor;
   using ConvolutionDescriptor_t = DummyDescriptor;
   using FilterDescriptor_t = DummyDescriptor;
   using DropoutDescriptor_t = DummyDescriptor;
   using PoolingDescriptor_t = DummyDescriptor;
   using TensorDescriptor_t = DummyDescriptor;

   using AlgorithmForward_t = DummyConvolutionFwdAlgo;
   using AlgorithmBackward_t = DummyConvolutionBwdDataAlgo;
   using AlgorithmHelper_t = DummyConvolutionBwdFilterAlgo;
   using AlgorithmDataType_t = DummyDataType;
   using ReduceTensorDescriptor_t = DummyDataType;

   using EmptyDescriptor_t = DummyDescriptor;
   using BNormLayer_t = TBatchNormLayer<TCpu<AReal>>;
   using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;

   using ConvLayer_t = CNN::TConvLayer<TCpu<AReal>>;
   using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
   using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
   using PoolingLayer_t = CNN::TMaxPoolLayer<TCpu<AReal>>;
   using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
   using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
   static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::ColumnMajor; }

   static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t({c, h * w, n}, GetTensorLayout());
   }
   static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
      return Tensor_t(buffer, {c, h * w, n}, GetTensorLayout());
   }
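   /* A minimal usage sketch (illustrative only, not part of the interface),
    * assuming a batch size of 4 and a 3x32x32 input. Note that the CPU
    * implementation packs the spatial dimensions, so the returned tensor
    * has shape {c, h*w, n} in column-major layout:
    *
    *    auto t = TCpu<float>::CreateTensor(4, 3, 32, 32);
    *    // t.GetShape() == {3, 1024, 4}
    */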
   // Create a vector of weight matrices from another weight vector, reusing
   // its shapes. Used e.g. by the optimizers to store intermediate weights.
   static void CreateWeightTensors(std::vector<Matrix_t> &newWeights, const std::vector<Matrix_t> &weights) {
      if (!newWeights.empty()) newWeights.clear();
      size_t n = weights.size();
      for (size_t i = 0; i < n; ++i)
         newWeights.emplace_back(weights[i].GetNrows(), weights[i].GetNcols());
   }
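   /* Usage sketch (illustrative): clone the geometry of an existing weight
    * vector, e.g. to allocate optimizer state of matching shapes. The
    * `layer` object and its GetWeights() accessor are hypothetical here:
    *
    *    std::vector<TCpuMatrix<float>> momentum;
    *    TCpu<float>::CreateWeightTensors(momentum, layer.GetWeights());
    */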
   /** Initialize CNN data/operator descriptors. Not used for the CPU architecture. */
   static void InitializeBNormDescriptors(TDescriptors *& /*descriptors*/, BNormLayer_t * /*L*/ = nullptr) {}

   static void InitializeConvDescriptors(TDescriptors *& /*descriptors*/, ConvLayer_t * /*L*/ = nullptr) {}
   static void InitializePoolDescriptors(TDescriptors *& /*descriptors*/, PoolingLayer_t * /*L*/ = nullptr) {}

   static void InitializeActivationDescriptor(ActivationDescriptor_t &, EActivationFunction, double /*coef*/ = 0.0) {}

   /** Release CNN data/operator descriptors. Not used for the CPU architecture. */
   static void ReleaseConvDescriptors(TDescriptors *&) {}
   static void ReleasePoolDescriptors(TDescriptors *&) {}
   static void ReleaseBNormDescriptors(TDescriptors *&) {}

   static void InitializeConvWorkspace(TWorkspace *& /*workspace*/, TDescriptors *& /*descriptors*/,
                                       const DNN::CNN::TConvParams & /*params*/, ConvLayer_t * /*L*/ = nullptr) {}
   static void InitializePoolDropoutWorkspace(TWorkspace *& /*workspace*/, TDescriptors *& /*descriptors*/,
                                              const DNN::CNN::TConvParams & /*params*/, PoolingLayer_t * /*L*/ = nullptr) {}

   static void FreeConvWorkspace(TWorkspace *&, ConvLayer_t *) {}
   static void FreePoolDropoutWorkspace(TWorkspace *&, PoolingLayer_t *) {}

   static void ReleaseDescriptor(ActivationDescriptor_t &) {}
   /** Matrix-multiply \p input with the transpose of \p weights and
    *  write the results into \p output. */
   static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights);

   static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights) {
      Matrix_t output_matrix = output.GetMatrix();
      MultiplyTranspose(output_matrix, input.GetMatrix(), weights);
   }
   /** Add the vector \p biases row-wise to the matrix \p output. */
   static void AddRowWise(Matrix_t &output, const Matrix_t &biases);

   static void AddRowWise(Tensor_t &output, const Matrix_t &biases) {
      Matrix_t output_matrix = output.GetMatrix();
      AddRowWise(output_matrix, biases);
   }
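   /* A dense (fully connected) forward step combines the two calls above:
    * output = input * weights^T, followed by a row-wise bias add. A sketch
    * with hypothetical dimensions (batch 8, 16 inputs, 4 outputs):
    *
    *    TCpuMatrix<float> input(8, 16), weights(4, 16), biases(4, 1), out(8, 4);
    *    TCpu<float>::MultiplyTranspose(out, input, weights);
    *    TCpu<float>::AddRowWise(out, biases);
    */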
   /** Perform the complete backward propagation step. If the provided
    *  \p activationGradientsBackward matrix is not empty, compute the
    *  gradients of the objective function with respect to the activations
    *  of the previous layer. Also compute the weight and bias gradients.
    *  Modifies the values in \p df, so it produces a valid result only the
    *  first time it is called after the corresponding forward propagation. */
   static void Backward(Tensor_t &activationGradientsBackward,
                        Matrix_t &weightGradients,
                        Matrix_t &biasGradients,
                        const Tensor_t &df,
                        const Tensor_t &activationGradients,
                        const Matrix_t &weights,
                        const Tensor_t &activationBackward);
   /** Add the elements in matrix \p B scaled by \p beta to the elements in
    *  matrix \p A. Required for the weight update in the gradient descent step. */
   static void ScaleAdd(Matrix_t &A,
                        const Matrix_t &B,
                        Scalar_t beta = 1.0);

   static void Copy(Matrix_t &B,
                    const Matrix_t &A);

   // Copy from a matrix of another architecture type.
   template <typename AMatrix_t>
   static void CopyDiffArch(Matrix_t &B, const AMatrix_t &A);

   /** The functions above, extended to tensors. */
   static void ScaleAdd(Tensor_t &A,
                        const Tensor_t &B,
                        Scalar_t beta = 1.0);

   static void Copy(Tensor_t &A,
                    const Tensor_t &B);

   // Copy from a tensor of another architecture type.
   template <typename ATensor_t>
   static void CopyDiffArch(Tensor_t &A,
                            const ATensor_t &B);

   // Copy from a vector of matrices of another architecture type.
   template <typename AMatrix_t>
   static void CopyDiffArch(std::vector<Matrix_t> &A,
                            const std::vector<AMatrix_t> &B);
   /** Apply the activation function \p activFunct to the tensor \p X in place. */
   static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct,
                                         const ActivationDescriptor_t activationDescr,
                                         const double coef = 0.0, const Scalar_t alpha = 1,
                                         const Scalar_t beta = 0);

   /** Compute the gradient of the activation function and write it into \p dX. */
   static void ActivationFunctionBackward(Tensor_t &dX, const Tensor_t &Y,
                                          const Tensor_t &dY, const Tensor_t &X,
                                          EActivationFunction activFunct,
                                          const ActivationDescriptor_t activationDescr,
                                          const Scalar_t alpha = 1,
                                          const Scalar_t beta = 0);
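   /* Sketch of a forward/backward activation pair (illustrative). X is
    * transformed in place on the forward pass; the backward pass fills dX
    * from the saved input X, the output Y and the incoming gradient dY:
    *
    *    TCpu<float>::ActivationFunctionForward(X, EActivationFunction::kRelu, ActivationDescriptor_t());
    *    TCpu<float>::ActivationFunctionBackward(dX, Y, dY, X, EActivationFunction::kRelu, ActivationDescriptor_t());
    */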
   static void IdentityDerivative(Tensor_t &B,
                                  const Tensor_t &A);

   static void Relu(Tensor_t &B);
   static void ReluDerivative(Tensor_t &B,
                              const Tensor_t &A);

   static void Sigmoid(Tensor_t &B);
   static void SigmoidDerivative(Tensor_t &B,
                                 const Tensor_t &A);

   static void Tanh(Tensor_t &B);
   static void TanhDerivative(Tensor_t &B,
                              const Tensor_t &A);

   static void SymmetricRelu(Tensor_t &B);
   static void SymmetricReluDerivative(Tensor_t &B,
                                       const Tensor_t &A);

   static void SoftSign(Tensor_t &B);
   static void SoftSignDerivative(Tensor_t &B,
                                  const Tensor_t &A);

   static void Gauss(Tensor_t &B);
   static void GaussDerivative(Tensor_t &B,
                               const Tensor_t &A);
   static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
                                    const Matrix_t &weights);
   static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
                                         const Matrix_t &output, const Matrix_t &weights);

   /** Sigmoid transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer in the net. */
   static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                const Matrix_t &weights);
   static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                     const Matrix_t &output, const Matrix_t &weights);

   /** Softmax transformation is implicitly applied, thus \p output should
    *  hold the linear activations of the last layer in the net. */
   static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
                                       const Matrix_t &weights);
   static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
                                            const Matrix_t &output, const Matrix_t &weights);
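   /* Sketch of a loss evaluation plus its gradient for backpropagation
    * (illustrative; Y holds the targets, output the network predictions and
    * weights the per-event weights, all with matching batch dimensions):
    *
    *    auto loss = TCpu<float>::MeanSquaredError(Y, output, weights);
    *    TCpu<float>::MeanSquaredErrorGradients(dY, Y, output, weights); // dY seeds the backward pass
    */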
   /** Output functions transform the activations of the last layer into a
    *  valid prediction \p YHat, e.g. the sigmoid transformation for
    *  two-class classification. */
   static void Sigmoid(Matrix_t &YHat,
                       const Matrix_t &);
   static void Softmax(Matrix_t &YHat,
                       const Matrix_t &);
   static Scalar_t L1Regularization(const Matrix_t &W);
   static void AddL1RegularizationGradients(Matrix_t &A,
                                            const Matrix_t &W,
                                            Scalar_t weightDecay);

   static Scalar_t L2Regularization(const Matrix_t &W);
   static void AddL2RegularizationGradients(Matrix_t &A,
                                            const Matrix_t &W,
                                            Scalar_t weightDecay);
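   /* Sketch (illustrative): the regularized loss adds a penalty per weight
    * matrix, and the matching gradient term is accumulated into the weight
    * gradients, e.g. for L2:
    *
    *    auto penalty = weightDecay * TCpu<float>::L2Regularization(W);
    *    TCpu<float>::AddL2RegularizationGradients(weightGradients, W, weightDecay);
    */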
   static void InitializeGauss(Matrix_t &A);
   static void InitializeUniform(Matrix_t &A);
   static void InitializeIdentity(Matrix_t &A);
   static void InitializeZero(Matrix_t &A);
   static void InitializeGlorotNormal(Matrix_t &A);
   static void InitializeGlorotUniform(Matrix_t &A);

   // Return the static instance of the random generator used for
   // initialization. If it does not yet exist, it is created with a
   // default seed.
   static TRandom &GetRandomGenerator();
   // Set the seed of the static random generator, creating the generator if needed.
   static void SetRandomSeed(size_t seed);
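   /* Sketch (illustrative): fixing the seed before initialization makes the
    * weight initialization reproducible across runs:
    *
    *    TCpu<float>::SetRandomSeed(42);
    *    TCpuMatrix<float> W(64, 32);
    *    TCpu<float>::InitializeGlorotUniform(W);
    */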
   /** Apply dropout with activation probability \p p to the given tensor
    *  \p A and scale the result by the reciprocal of \p p. */
   static void DropoutForward(Tensor_t &A,
                              TDescriptors *descriptors,
                              TWorkspace *workspace,
                              Scalar_t p);

   static void DropoutForward(Matrix_t &A, Scalar_t p) {
      Tensor_t tA(A);
      DropoutForward(tA, static_cast<TDescriptors *>(nullptr), static_cast<TWorkspace *>(nullptr), p);
   }
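   /* Sketch (illustrative): during training, dropout keeps each activation
    * with probability p and rescales the survivors by 1/p, so the
    * expectation of each activation is unchanged:
    *
    *    TCpu<float>::DropoutForward(A, 0.5f); // matrix overload, no descriptors needed on CPU
    */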
   // Backward pass for dropout: only needed for the cuDNN backend, a no-op here.
   static void DropoutBackward(Tensor_t & /*A*/,
                               TDescriptors * /*descriptors*/,
                               TWorkspace * /*workspace*/) {}
   /** During training, the inputs of each batch are normalized to zero mean
    *  and unit variance and then scaled by two parameters, different for
    *  each input variable: a scale factor gamma and an offset beta. */
   static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
                                             Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
                                             Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
                                             Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);

   /** During inference the inputs are normalized using the running mean and
    *  variance accumulated during training instead of the batch statistics. */
   static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
                                              Tensor_t &y, const Matrix_t &runningMeans,
                                              const Matrix_t &runningVars, Scalar_t epsilon,
                                              const TensorDescriptor_t &);

   static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
                                      Matrix_t &gamma, // beta is not needed in the backward pass
                                      Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
                                      const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);

   // Helper function for the batch-normalization layer.
   static Tensor_t BatchNormLayerReshapeTensor(int axis, const Tensor_t &x);
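   /* For reference, the normalization applied per input feature follows the
    * standard batch-norm definition (spelled out here as an assumption):
    *
    *    y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
    *
    * During training the running statistics are blended with the batch
    * statistics under the control of \p momentum; at inference time the
    * running statistics are used in place of the batch ones.
    */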
   /** Calculate how many neurons "fit" along one output axis, given the
    *  input size as well as the layer's hyperparameters. */
   static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
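   /* The returned size follows the usual convolution arithmetic (assuming
    * this standard formula is what the implementation uses):
    *
    *    outDim = (imgDim - fltDim + 2 * padding) / stride + 1
    *
    * e.g. a 32-pixel axis with a 5-pixel filter, padding 2 and stride 1
    * keeps its size: (32 - 5 + 4) / 1 + 1 = 32.
    */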
   /** Transform the matrix \p B to local-view format, suitable for
    *  convolution, and store it in matrix \p A. */
   static void Im2col(Matrix_t &A,
                      const Matrix_t &B,
                      size_t imgHeight,
                      size_t imgWidth,
                      size_t fltHeight,
                      size_t fltWidth,
                      size_t strideRows,
                      size_t strideCols,
                      size_t zeroPaddingHeight,
                      size_t zeroPaddingWidth);

   static void Im2colIndices(std::vector<int> &V,
                             const Matrix_t &B,
                             size_t nLocalViews,
                             size_t imgHeight,
                             size_t imgWidth,
                             size_t fltHeight,
                             size_t fltWidth,
                             size_t strideRows,
                             size_t strideCols,
                             size_t zeroPaddingHeight,
                             size_t zeroPaddingWidth);

   static void Im2colFast(Matrix_t &A,
                          const Matrix_t &B,
                          const std::vector<int> &V);
   /** Rotate the weight matrix \p B and store the result in matrix \p A. */
   static void RotateWeights(Matrix_t &A,
                             const Matrix_t &B,
                             size_t filterDepth,
                             size_t filterHeight,
                             size_t filterWidth,
                             size_t numFilters);

   /** Add the biases in the convolutional layer. */
   static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);

   /** Dummy placeholder: internal preparation is only required for the GPU architectures. */
   static void PrepareInternals(Tensor_t &) {}
   /** Forward propagation in the convolutional layer. */
   static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc,
                                const Tensor_t &input,
                                const Matrix_t &weights, const Matrix_t &biases,
                                const DNN::CNN::TConvParams &params,
                                EActivationFunction activFunc, Tensor_t & /*inputPrime*/,
                                const ConvDescriptors_t & /*descriptors*/,
                                ConvWorkspace_t & /*workspace*/);
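   /* Conceptually, the CPU forward pass is an im2col followed by a GEMM:
    * each local receptive field of the input becomes one column of an
    * intermediate matrix, so the convolution reduces to
    *
    *    output = weights x inputPrime   (one matrix product per batch element)
    *
    * which is why Im2col/Im2colFast above are part of this interface.
    */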
   /** Perform the complete backward propagation step in a convolutional
    *  layer. If the provided \p activationGradientsBackward matrix is not
    *  empty, compute the gradients of the objective function with respect
    *  to the activations of the previous layer. Also compute the weight and
    *  bias gradients. Modifies the values in \p df, so it produces a valid
    *  result only the first time it is called after the corresponding
    *  forward propagation. */
   static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
                                 Matrix_t &biasGradients, Tensor_t &df, Tensor_t &activationGradients,
                                 const Matrix_t &weights, const Tensor_t &activationBackward,
                                 const Tensor_t &outputTensor, EActivationFunction activFunc,
                                 const ConvDescriptors_t & /*descriptors*/, ConvWorkspace_t & /*workspace*/,
                                 size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
                                 size_t height, size_t width, size_t filterDepth, size_t filterHeight,
                                 size_t filterWidth, size_t nLocalViews);
   /** Utility function for calculating the activation gradients of the layer
    *  before the convolutional layer. */
   static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, const Tensor_t &df,
                                                const Matrix_t &weights, size_t batchSize, size_t inputHeight,
                                                size_t inputWidth, size_t depth, size_t height, size_t width,
                                                size_t filterDepth, size_t filterHeight, size_t filterWidth);
   /** Utility function for calculating the weight gradients of the
    *  convolutional layer. */
   static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df,
                                            const Tensor_t &activations_backward, size_t batchSize,
                                            size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
                                            size_t width, size_t filterDepth, size_t filterHeight,
                                            size_t filterWidth, size_t nLocalViews);

   /** Utility function for calculating the bias gradients of the
    *  convolutional layer. */
   static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df,
                                          size_t batchSize, size_t depth, size_t nLocalViews);
   /** Downsample the tensor \p C to the tensor \p A, using the max
    *  operation, and store the winning indices in tensor \p B. */
   static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C,
                          const PoolingDescriptors_t & /*descriptors*/,
                          PoolingWorkspace_t & /*workspace*/,
                          size_t imgHeight, size_t imgWidth, size_t fltHeight,
                          size_t fltWidth, size_t strideRows, size_t strideCols);
   /** Perform the complete backward propagation step in a pooling layer.
    *  Based on the winning indices stored in the index matrix, it simply
    *  forwards the activation gradients to the previous layer. */
   static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
                                    const Tensor_t &activationGradients,
                                    const Tensor_t &indexMatrix,
                                    const Tensor_t & /*inputActivation*/,
                                    const Tensor_t & /*outputTensor*/,
                                    const PoolingDescriptors_t & /*descriptors*/,
                                    PoolingWorkspace_t & /*workspace*/,
                                    size_t imgHeight, size_t imgWidth,
                                    size_t fltHeight, size_t fltWidth,
                                    size_t strideRows, size_t strideCols,
                                    size_t nLocalViews);
   /** Transform the matrix \p B to a matrix \p A with different dimensions. */
   static void Reshape(Matrix_t &A, const Matrix_t &B);

   /** Flatten the tensor \p B, such that each matrix is stretched into one
    *  row, resulting in the matrix \p A. */
   static void Flatten(Tensor_t &A, const Tensor_t &B);

   /** Transform each row of \p B into a matrix and store it in the tensor \p A. */
   static void Deflatten(Tensor_t &A, const Tensor_t &B);

   /** Rearrange data according to time: fill the B x T x D output tensor
    *  from a T x B x D input tensor. */
   static void Rearrange(Tensor_t &out, const Tensor_t &in);
   /** Backward pass for recurrent networks. */
   static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // B x H
                                           Matrix_t &input_weight_gradients,
                                           Matrix_t &state_weight_gradients,
                                           Matrix_t &bias_gradients,
                                           Matrix_t &df,                  // D x H
                                           const Matrix_t &state,         // B x H
                                           const Matrix_t &weights_input, // H x D
                                           const Matrix_t &weights_state, // H x H
                                           const Matrix_t &input,         // B x D
                                           Matrix_t &input_gradient);
   /** Standard multiplication of two matrices \p A and \p B, with the result
    *  written into \p C. */
   static void Multiply(Matrix_t &C,
                        const Matrix_t &A,
                        const Matrix_t &B);

   /** Matrix multiplication of the matrix \p input with the transpose of the
    *  matrix \p Weights, with the result written into \p output. */
   static void TransposeMultiply(Matrix_t &output,
                                 const Matrix_t &input,
                                 const Matrix_t &Weights,
                                 Scalar_t alpha = 1.0, Scalar_t beta = 0.);

   /** In-place Hadamard (element-wise) product of \p A and \p B, with the
    *  result written into \p A. */
   static void Hadamard(Tensor_t &A,
                        const Tensor_t &B);
   static void Hadamard(Matrix_t &A,
                        const Matrix_t &B);

   /** Sum the columns of the (m x n) matrix \p A and write the results into
    *  the first m elements of \p B. */
   static void SumColumns(Matrix_t &B,
                          const Matrix_t &A,
                          Scalar_t alpha = 1.0, Scalar_t beta = 0.);
   /** Compute the sum of all elements in \p A. */
   static Scalar_t Sum(const Matrix_t &A);

   /** Check two matrices for element-wise equality, within a tolerance that
    *  accounts for floating-point rounding errors. */
   static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
   /** Add the constant \p beta to all elements of matrix \p A, in place. */
   static void ConstAdd(Matrix_t &A, Scalar_t beta);

   /** Multiply all elements of matrix \p A by the constant \p beta, in place. */
   static void ConstMult(Matrix_t &A, Scalar_t beta);

   /** Take the reciprocal of each element of matrix \p A, in place. */
   static void ReciprocalElementWise(Matrix_t &A);

   /** Square each element of matrix \p A, in place. */
   static void SquareElementWise(Matrix_t &A);

   /** Take the square root of each element of matrix \p A, in place. */
   static void SqrtElementWise(Matrix_t &A);
   // Optimizer functions (Adam).
   static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps);
   static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);
   static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);
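   /* For reference, these correspond to the usual Adam steps (a sketch of
    * the standard update rule, written in conventional notation):
    *
    *    M = beta1 * M + (1 - beta1) * gradient       // AdamUpdateFirstMom
    *    V = beta2 * V + (1 - beta2) * gradient^2     // AdamUpdateSecondMom
    *    A = A - alpha * M / (sqrt(V) + eps)          // AdamUpdate
    */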
   // Printing of a tensor, for debugging.
   static void PrintTensor(const Tensor_t &A, const std::string name = "Cpu-tensor", bool truncate = false);
};

//____________________________________________________________________________
template <typename AReal>
template <typename AMatrix_t>
void TCpu<AReal>::CopyDiffArch(TCpuMatrix<AReal> &B, const AMatrix_t &A)
{
   // Copy from another architecture via TMatrixT as the common intermediate
   // representation. Not very efficient, but it works whenever the source
   // matrix type is convertible to TMatrixT.
   TMatrixT<AReal> tmp = A;
   Copy(B, TCpuMatrix<AReal>(tmp));
}
//____________________________________________________________________________
template <typename AReal>
template <typename ATensor_t>
void TCpu<AReal>::CopyDiffArch(TCpuTensor<AReal> &B, const ATensor_t &A)
{
   R__ASSERT(A.GetSize() == B.GetSize());
   for (size_t i = 0; i < A.GetFirstSize(); ++i) {
      TMatrixT<AReal> tmpIn = A.At(i); // convert a (D, HW) slice to TMatrixT(D, HW)
      TCpuMatrix<AReal> tmpOut = B.At(i).GetMatrix();
      Copy(tmpOut, TCpuMatrix<AReal>(tmpIn));
   }
}
//____________________________________________________________________________
// Implementation using a vector of matrices for the weights.
template <typename AReal>
template <typename AMatrix_t>
void TCpu<AReal>::CopyDiffArch(std::vector<TCpuMatrix<AReal>> &A, const std::vector<AMatrix_t> &B)
{
   for (size_t i = 0; i < A.size(); ++i) {
      CopyDiffArch(A[i], B[i]);
   }
}
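/* Usage sketch (illustrative): copying weights from another architecture,
 * e.g. the reference TMatrixT-based one. The destination vector must already
 * hold matrices of matching shapes (see CreateWeightTensors above);
 * refWeights is a hypothetical std::vector<TMatrixT<float>>:
 *
 *    std::vector<TCpuMatrix<float>> cpuWeights; // pre-allocated to matching shapes
 *    TCpu<float>::CopyDiffArch(cpuWeights, refWeights);
 */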
//____________________________________________________________________________
template <typename AReal>
void TCpu<AReal>::PrintTensor(const typename TCpu<AReal>::Tensor_t &A, const std::string name, bool truncate)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";
   std::cout << " tensor count " << A.GetBufferUseCount() << std::endl;
   if (A.GetShape().size() == 2) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         size_t n = A.GetShape()[1];
         if (truncate) n = std::min(n, size_t(10));
         for (size_t j = 0; j < n; ++j) {
            std::cout << A(i, j) << " ";
         }
         if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
         std::cout << " } " << std::endl;
      }
   }
   else if (A.GetShape().size() == 3) {
      for (size_t i = 0; i < A.GetFirstSize(); ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetHSize(); ++j) {
            std::cout << "{ ";
            size_t n = A.GetWSize();
            if (truncate) n = std::min(n, size_t(10));
            for (size_t k = 0; k < n; ++k) {
               std::cout << A(i, j, k) << " ";
            }
            if (truncate && n < A.GetWSize()) std::cout << " ...... ";
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   }
   else {
      // Fallback: print the tensor as a flat sequence of elements.
      for (size_t l = 0; l < A.GetSize(); ++l) {
         std::cout << A.GetData()[l] << " ";
      }
      std::cout << std::endl;
   }
}

} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_ARCHITECTURES_CPU