18 #ifndef TMVA_DNN_ARCHITECTURES_CPU 
   19 #define TMVA_DNN_ARCHITECTURES_CPU 
   // Empty placeholder types: none of them has any member. In the code visible
   // here, DummyDescriptor, DummyConvolutionFwdAlgo, DummyConvolutionBwdDataAlgo,
   // DummyConvolutionBwdFilterAlgo and DummyDataType are used as targets of the
   // type aliases declared further down; the remaining placeholders are not
   // referenced in the visible part of the file.
   42  struct DummyDescriptor {};
 
   43  struct DummyFilterDescriptor {};
 
   44  struct DummyConvolutionDescriptor {};
 
   45  struct DummyDropoutDescriptor {};
 
   46  struct DummyPoolingDescriptor {};
 
   47  struct DummyConvolutionFwdAlgo {};
 
   48  struct DummyConvolutionBwdDataAlgo {};
 
   49  struct DummyConvolutionBwdFilterAlgo {};
 
   50  struct DummyDataType {};
 
   52  struct DummyEmptyDescriptor {};
 
   61 template<
typename AReal = Float_t>
 
   // Static pointer to a TRandom generator (declaration only; its definition
   // and initialisation are not visible here).
   // Presumably the object behind GetRandomGenerator()/SetRandomSeed() — confirm.
   65    static TRandom * fgRandomGen;
 
   // Basic numeric and container types, all parameterised on the element
   // type AReal.
   68    using Scalar_t       = AReal;
 
   69    using Tensor_t       = TCpuTensor<AReal>;
 
   70    using Matrix_t       = TCpuMatrix<AReal>;
 
   // Host and device buffer aliases name the same type (TCpuBuffer<AReal>).
   71    using HostBuffer_t   = TCpuBuffer<AReal>;
 
   72    using DeviceBuffer_t = TCpuBuffer<AReal>;
 
   // Descriptor aliases. Every descriptor alias in this first group resolves
   // to the same empty DummyDescriptor type.
   // NOTE(review): the dedicated placeholders DummyFilterDescriptor,
   // DummyConvolutionDescriptor, DummyDropoutDescriptor, DummyPoolingDescriptor
   // and DummyEmptyDescriptor are declared earlier in the file but are not used
   // by these aliases — confirm this is intentional.
   74    using ActivationDescriptor_t  = DummyDescriptor;
 
   75    using ConvolutionDescriptor_t = DummyDescriptor;
 
   76    using FilterDescriptor_t      = DummyDescriptor;
 
   77    using DropoutDescriptor_t     = DummyDescriptor;
 
   79    using PoolingDescriptor_t     = DummyDescriptor;
 
   80    using TensorDescriptor_t      = DummyDescriptor;
 
   // Algorithm-selection aliases; each maps to an empty placeholder struct.
   82    using AlgorithmForward_t      = DummyConvolutionFwdAlgo;
 
   83    using AlgorithmBackward_t     = DummyConvolutionBwdDataAlgo;
 
   84    using AlgorithmHelper_t       = DummyConvolutionBwdFilterAlgo;
 
   85    using AlgorithmDataType_t     = DummyDataType;
 
   // NOTE(review): a *descriptor* alias mapped to DummyDataType rather than
   // DummyDescriptor — harmless since both are empty, but confirm it is intended.
   86    using ReduceTensorDescriptor_t = DummyDataType;
 
   88    using EmptyDescriptor_t       = DummyDescriptor;        
 
   // Layer types instantiated for this architecture (TCpu<AReal>) together
   // with the descriptor/workspace templates parameterised on those layers.
   // Batch normalisation:
   90    using BNormLayer_t            = TBatchNormLayer<TCpu<AReal>>;
 
   91    using BNormDescriptors_t      = TDNNGenDescriptors<BNormLayer_t>;
 
   // Convolution:
   93    using ConvLayer_t             = CNN::TConvLayer<TCpu<AReal>>;
 
   94    using ConvDescriptors_t       = CNN::TCNNDescriptors<ConvLayer_t>;
 
   95    using ConvWorkspace_t         = CNN::TCNNWorkspace<ConvLayer_t>;
 
   // Pooling (the pooling layer type is the max-pool layer):
   96    using PoolingLayer_t          = CNN::TMaxPoolLayer<TCpu<AReal>>;
 
   97    using PoolingDescriptors_t    = CNN::TCNNDescriptors<PoolingLayer_t>;
 
   98    using PoolingWorkspace_t      = CNN::TCNNWorkspace<PoolingLayer_t>;
 
  // Returns the memory layout used when this class constructs tensors:
  // always MemoryLayout::ColumnMajor.
  100    static TMVA::Experimental::MemoryLayout GetTensorLayout() { 
return TMVA::Experimental::MemoryLayout::ColumnMajor; }
 
  // Creates a new Tensor_t from the four sizes n, c, h, w.
  // The shape passed to the tensor constructor is {c, h*w, n}: h and w are
  // merged into one dimension and n comes last. The layout argument is
  // whatever GetTensorLayout() returns.
  // (The closing brace of this function is on a listing line not shown here.)
  102    static Tensor_t CreateTensor(
size_t n, 
size_t c, 
size_t h, 
size_t w) {
 
  103       return Tensor_t( {c,h*w,n}, GetTensorLayout());
 
  // Same as the four-argument CreateTensor, but additionally forwards an
  // existing DeviceBuffer_t as first constructor argument. The shape is again
  // {c, h*w, n} with the layout from GetTensorLayout().
  // NOTE(review): presumably the tensor wraps `buffer` instead of allocating —
  // confirm against the TCpuTensor constructor.
  // (The closing brace of this function is on a listing line not shown here.)
  105    static Tensor_t CreateTensor(DeviceBuffer_t buffer, 
size_t n, 
size_t c, 
size_t h, 
size_t w) {
 
  106       return Tensor_t( buffer, {c,h*w,n}, GetTensorLayout());
 
  // Rebuilds newWeights so that it holds one matrix per entry of `weights`.
  // Any previous content of newWeights is discarded first. Each new matrix is
  // constructed from (GetNrows(), GetNcols()) of the corresponding source
  // matrix; only the dimensions are passed on, no element values are copied
  // by this code.
  // NOTE(review): the empty() test before clear() is redundant, and calling
  // newWeights.reserve(n) before the loop would avoid repeated reallocation.
  // (The closing brace of this function is on a listing line not shown here.)
  110    static void  CreateWeightTensors( std::vector<Matrix_t> & newWeights, 
const std::vector<Matrix_t> & weights) {
 
  111       if (!newWeights.empty()) newWeights.clear();
 
  112       size_t n =  weights.size();
 
  113       for (
size_t i = 0; i < n; ++i)
 
  114          newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
 
  // Descriptor initialisation hooks.
  // The first three declarations are cut off in this listing: only the leading
  // unnamed `TDescriptors * &` parameter is visible, the remaining parameters
  // and the bodies are on lines not shown.
  123    static void InitializeBNormDescriptors(TDescriptors * & ,
 
  126    static void InitializeConvDescriptors(TDescriptors * & ,
 
  128    static void InitializePoolDescriptors(TDescriptors * & ,
 
  // No-op: all three parameters are unnamed and the body is empty. The last
  // parameter is a double defaulting to 0.0.
  131    static void InitializeActivationDescriptor(ActivationDescriptor_t &, EActivationFunction  , 
double  = 0.0) {}
 
  // Descriptor release hooks. Each takes an unnamed `TDescriptors * &` and has
  // an empty body, i.e. does nothing for this architecture.
  134    static void ReleaseConvDescriptors(TDescriptors * & ) {}
 
  135    static void ReleasePoolDescriptors(TDescriptors * & ) {}
 
  136    static void ReleaseBNormDescriptors(TDescriptors * & ) {}
 
  // Workspace initialisation hooks for convolution and pooling/dropout layers.
  // Both declarations are incomplete in this listing (intermediate and final
  // parameter lines as well as the bodies are not shown). Visible parameters:
  // an unnamed `TWorkspace * &` and an unnamed `const DNN::CNN::TConvParams &`.
  138    static void InitializeConvWorkspace(TWorkspace * & ,
 
  140                                        const DNN::CNN::TConvParams & ,
 
  142    static void InitializePoolDropoutWorkspace(TWorkspace * & ,
 
  144                                        const DNN::CNN::TConvParams & ,
 
  // Workspace/descriptor release hooks. All parameters are unnamed and all
  // bodies are empty, so these calls do nothing for this architecture.
  147    static void FreeConvWorkspace(TWorkspace * & , ConvLayer_t *) {}   
 
  148    static void FreePoolDropoutWorkspace(TWorkspace * & , PoolingLayer_t *) {}
 
  150    static void ReleaseDescriptor(ActivationDescriptor_t &  ) {}
 
  // Matrix overload of MultiplyTranspose (declaration only; the definition is
  // not in this chunk). Writes into `output`; `input` and `weights` are
  // read-only.
  // NOTE(review): by its name presumably output = input * weights^T — confirm
  // against the implementation.
  164    static void MultiplyTranspose(Matrix_t &output, 
const Matrix_t &input, 
const Matrix_t &weights);
 
  // Tensor overload: obtains Matrix_t objects from output.GetMatrix() and
  // input.GetMatrix() and forwards to the Matrix_t overload.
  // NOTE(review): the result is written into the local `output_matrix`; this
  // only reaches `output` if GetMatrix() returns a matrix sharing the tensor's
  // storage — presumably it does, confirm against TCpuTensor::GetMatrix.
  // (The closing brace of this function is on a listing line not shown here.)
  166    static void MultiplyTranspose(Tensor_t &output, 
const Tensor_t &input, 
const Matrix_t &weights) {
 
  167       Matrix_t output_matrix = output.GetMatrix();
 
  168       MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
 
  // Matrix overload of AddRowWise (declaration only; definition not in this
  // chunk). Modifies `output` using the read-only `biases`.
  173    static void AddRowWise(Matrix_t &output,
const Matrix_t &biases);
 
  // Tensor overload: takes output.GetMatrix() and forwards to the Matrix_t
  // overload.
  // NOTE(review): relies on GetMatrix() returning a matrix that shares storage
  // with `output` — confirm against TCpuTensor::GetMatrix.
  // (The closing brace of this function is on a listing line not shown here.)
  175    static void AddRowWise(Tensor_t &output, 
const Matrix_t &biases) {
 
  176       Matrix_t output_matrix = output.GetMatrix();
 
  177       AddRowWise(output_matrix, biases);
 
  // Backward pass entry point (declaration only; definition not in this chunk).
  // The three leading non-const references (activationGradientsBackward,
  // weightGradients, biasGradients) are the writable arguments; the remaining
  // visible parameters are const references.
  // The listing numbering jumps from 196 to 198, so one parameter line is
  // missing from this view.
  194    static void Backward(Tensor_t & activationGradientsBackward,
 
  195                         Matrix_t & weightGradients,
 
  196                         Matrix_t & biasGradients,
 
  198                         const Tensor_t & activationGradients,
 
  199                         const Matrix_t & weights,
 
  200                         const Tensor_t & activationBackward);
 
  // Matrix versions of ScaleAdd / Copy / CopyDiffArch (declarations only).
  // ScaleAdd: the second parameter line (listing line 206) is not shown; the
  // last parameter `beta` defaults to 1.0.
  205    static void ScaleAdd(Matrix_t & A,
 
  207                         Scalar_t beta = 1.0);
 
  // Copy: only the first, writable parameter is visible in this listing.
  209    static void Copy(Matrix_t & B,
 
  // CopyDiffArch: B is the writable destination, A is a read-only matrix of an
  // arbitrary type AMatrix_t.
  213    template<
typename AMatrix_t>
 
  214    static void CopyDiffArch(Matrix_t & B, 
const AMatrix_t & A);
 
  // Tensor versions of ScaleAdd / Copy / CopyDiffArch, plus a CopyDiffArch
  // overload for vectors of matrices (declarations only).
  // ScaleAdd: the second parameter line (listing line 219) is not shown; the
  // last parameter `beta` defaults to 1.0.
  218    static void ScaleAdd(Tensor_t & A,
 
  220                         Scalar_t beta = 1.0);
 
  // Copy: only the first, writable parameter is visible in this listing.
  222    static void Copy(Tensor_t & A,
 
  // Here A is the writable destination and B the read-only source of an
  // arbitrary type. Note that the parameter names are swapped relative to the
  // matrix overload, where B is the destination.
  226    template<
typename ATensor_t>
 
  227    static void CopyDiffArch(Tensor_t & A,
 
  228                      const ATensor_t & B);
 
  // Vector overload: A is the writable vector of Matrix_t, B a read-only
  // vector of matrices of an arbitrary type AMatrix_t.
  231    template<
typename AMatrix_t>
 
  232    static void CopyDiffArch(std::vector<Matrix_t>  & A,
 
  233                       const std::vector<AMatrix_t> & B);
 
  // Generic activation entry points selected by an EActivationFunction value
  // (declarations only; definitions not in this chunk).
  //
  // Forward: X is the only non-const reference, so presumably it is modified
  // in place — confirm. Defaults: coef = 0.0, alpha = 1, beta = 0.
  // The descriptor is taken by value.
  256    static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
 
  257                           const ActivationDescriptor_t activationDescr,
 
  258                           const double coef = 0.0, 
const Scalar_t alpha = 1,
 
  259                           const Scalar_t beta = 0);
 
  // Backward: dX is the only writable argument; Y, dY and X are read-only.
  // Defaults: alpha = 1, beta = 0. The descriptor is taken by value.
  262    static void ActivationFunctionBackward(Tensor_t & dX, 
const Tensor_t & Y,
 
  263                                           const Tensor_t & dY,  
const Tensor_t & X,
 
  264                                           EActivationFunction activFunct,
 
  265                                           const ActivationDescriptor_t activationDescr,
 
  266                                           const Scalar_t alpha = 1,
 
  267                                           const Scalar_t beta = 0);
 
  // Individual activation functions and their derivatives (declarations only).
  // Each activation takes a single writable Tensor_t & B.
  // Every *Derivative declaration is cut off in this listing: only the first,
  // writable parameter is visible, the second parameter line is not shown.
  269    static void IdentityDerivative(Tensor_t & B,
 
  272    static void Relu(Tensor_t & B);
 
  273    static void ReluDerivative(Tensor_t & B,
 
  276    static void Sigmoid(Tensor_t & B);
 
  277    static void SigmoidDerivative(Tensor_t & B,
 
  280    static void Tanh(Tensor_t & B);
 
  281    static void TanhDerivative(Tensor_t & B,
 
  284    static void SymmetricRelu(Tensor_t & B);
 
  285    static void SymmetricReluDerivative(Tensor_t & B,
 
  288    static void SoftSign(Tensor_t & B);
 
  289    static void SoftSignDerivative(Tensor_t & B,
 
  292    static void Gauss(Tensor_t & B);
 
  293    static void GaussDerivative(Tensor_t & B,
 
  // Loss functions and their gradients (declarations only; definitions are
  // not in this chunk). All three pairs share the same signatures:
  //   - the loss takes read-only Y, output and weights and returns a Scalar_t;
  //   - the *Gradients function writes into dY and reads Y, output and weights.
  // NOTE(review): presumably Y holds the targets and `weights` per-event
  // weights — confirm against the implementation.
  311    static Scalar_t MeanSquaredError(
const Matrix_t &Y, 
const Matrix_t &output,
 
  312                                     const Matrix_t &weights);
 
  313    static void MeanSquaredErrorGradients(Matrix_t &dY, 
const Matrix_t &Y,
 
  314                                          const Matrix_t &output, 
const Matrix_t &weights);
 
  318    static Scalar_t CrossEntropy(
const Matrix_t &Y, 
const Matrix_t &output,
 
  319                                 const Matrix_t &weights);
 
  321    static void CrossEntropyGradients(Matrix_t &dY, 
const Matrix_t &Y,
 
  322                                      const Matrix_t &output, 
const Matrix_t &weights);
 
  326    static Scalar_t SoftmaxCrossEntropy(
const Matrix_t &Y, 
const Matrix_t &output,
 
  327                                        const Matrix_t &weights);
 
  328    static void SoftmaxCrossEntropyGradients(Matrix_t &dY, 
const Matrix_t &Y,
 
  329                                             const Matrix_t &output, 
const Matrix_t &weights);
 
  // Matrix-based Sigmoid and Softmax (declarations only). Both are cut off in
  // this listing: only the writable first parameter YHat is visible, the
  // second parameter line is not shown.
  // This Sigmoid(Matrix_t &, ...) is a separate overload from the
  // single-argument Sigmoid(Tensor_t &) activation declared earlier.
  345    static void Sigmoid(Matrix_t &YHat,
 
  347    static void Softmax(Matrix_t &YHat,
 
  // L1/L2 regularisation (declarations only; definitions not in this chunk).
  // L*Regularization reads a matrix W and returns a Scalar_t.
  // Add*RegularizationGradients writes into A and takes a Scalar_t weightDecay;
  // their middle parameter line (listing lines 367 / 372) is not shown.
  365    static Scalar_t L1Regularization(
const Matrix_t & W);
 
  366    static void AddL1RegularizationGradients(Matrix_t & A,
 
  368                                             Scalar_t weightDecay);
 
  370    static Scalar_t L2Regularization(
const Matrix_t & W);
 
  371    static void AddL2RegularizationGradients(Matrix_t & A,
 
  373                                             Scalar_t weightDecay);
 
  // Matrix initialisation schemes (declarations only). Each takes the matrix
  // to be filled as a writable reference; the distributions and scales used
  // are defined in the implementation, which is not part of this chunk.
  388    static void InitializeGauss(Matrix_t & A);
 
  389    static void InitializeUniform(Matrix_t & A);
 
  390    static void InitializeIdentity(Matrix_t & A);
 
  391    static void InitializeZero(Matrix_t & A);
 
  392    static void InitializeGlorotNormal(Matrix_t & A);
 
  393    static void InitializeGlorotUniform(Matrix_t & A);
 
  // Access to the random generator as a TRandom reference (declaration only).
  // Presumably returns the generator held in fgRandomGen — confirm.
  397    static TRandom & GetRandomGenerator();
 
  // Sets the seed of the random generator (declaration only).
  400    static void SetRandomSeed(
size_t seed);
 
  // Dropout.
  // Tensor overload (declaration, cut off in this listing: the final parameter
  // line is not shown). Takes the tensor A plus descriptor and workspace
  // pointers.
  414    static void DropoutForward(Tensor_t & A,
 
  415                               TDescriptors * descriptors,
 
  416                               TWorkspace   * workspace,
 
  // Matrix convenience overload: forwards to the tensor overload with null
  // descriptor and workspace pointers and the value p.
  // The local `tA` passed below is declared on listing line 420, which is not
  // shown here; presumably it is a Tensor_t built from A — confirm.
  // (The closing brace is likewise on a line not shown.)
  419    static void DropoutForward(Matrix_t & A, Scalar_t p) {
 
  421       DropoutForward( tA, static_cast<TDescriptors *> (
nullptr), static_cast<TWorkspace *> (
nullptr), p );
 
  // Cut off in this listing: only the unnamed first parameter is visible.
  425    static void DropoutBackward(Tensor_t & ,
 
  // Batch normalisation (declarations only; definitions not in this chunk).
  //
  // Training forward pass. x is read-only; y and all Matrix_t arguments are
  // non-const references. The Matrix_t parameter between `mean` and
  // `iVariance` is unnamed in this declaration.
  443    static void BatchNormLayerForwardTraining(
int axis, 
const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
 
  444                                              Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
 
  445                                              Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
 
  446                                              Scalar_t epsilon, 
const TensorDescriptor_t &bnParDescriptor);
 
  // Inference forward pass. runningMeans and runningVars are read-only here;
  // y is the writable tensor. The trailing descriptor parameter is unnamed.
  451    static void BatchNormLayerForwardInference(
int axis, 
const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
 
  452                                               Tensor_t &y, 
const Matrix_t &runningMeans,
 
  453                                               const Matrix_t &runningVars, Scalar_t epsilon,
 
  454                                               const TensorDescriptor_t &);
 
  // Backward pass. dx, dgamma and dbeta are the writable arguments; x, dy,
  // mean, variance and iVariance are read-only. The listing numbering jumps
  // from 458 to 460, so one parameter line is missing from this view.
  458    static void BatchNormLayerBackward(
int axis, 
const Tensor_t &x, 
const Tensor_t &dy, Tensor_t &dx,
 
  460                                       Matrix_t &dgamma, Matrix_t &dbeta, 
const Matrix_t &mean, 
const Matrix_t &variance,
 
  461                                       const Matrix_t &iVariance, Scalar_t epsilon, 
const TensorDescriptor_t &);
 
  // Returns a Tensor_t derived from x for the given axis (declaration only;
  // how the shape is changed is defined in the implementation).
  464    static Tensor_t BatchNormLayerReshapeTensor(
int axis, 
const Tensor_t &x);
 
  // Convolution helper routines (declarations only unless noted).
  //
  // Returns a size_t computed from image size, filter size, padding and
  // stride. Presumably the output size of a convolution along one dimension —
  // confirm against the implementation.
  479    static size_t calculateDimension(
size_t imgDim, 
size_t fltDim, 
size_t padding, 
size_t stride);
 
  // Writes into A from the read-only B using the given image/filter sizes,
  // strides and zero padding.
  483    static void Im2col(Matrix_t &A, 
const Matrix_t &B, 
size_t imgHeight, 
size_t imgWidth, 
size_t fltHeight,
 
  484                       size_t fltWidth, 
size_t strideRows, 
size_t strideCols, 
size_t zeroPaddingHeight,
 
  485                       size_t zeroPaddingWidth);
 
  // Fills the index vector V (writable) from the same geometry parameters,
  // plus nLocalViews.
  487    static void Im2colIndices(std::vector<int> &V, 
const Matrix_t &B, 
size_t nLocalViews, 
size_t imgHeight,
 
  488                              size_t imgWidth, 
size_t fltHeight, 
size_t fltWidth, 
size_t strideRows, 
size_t strideCols,
 
  489                              size_t zeroPaddingHeight, 
size_t zeroPaddingWidth);
 
  // Variant taking a read-only index vector V; presumably one produced by
  // Im2colIndices — confirm.
  490    static void Im2colFast(Matrix_t &A, 
const Matrix_t &B, 
const std::vector<int> &V);
 
  // Writes into A from the read-only B using filter depth/height/width and
  // the number of filters.
  494    static void RotateWeights(Matrix_t &A, 
const Matrix_t &B, 
size_t filterDepth, 
size_t filterHeight,
 
  495                              size_t filterWidth, 
size_t numFilters);
 
  // Modifies `output` using the read-only `biases`.
  498    static void AddConvBiases(Matrix_t &output, 
const Matrix_t &biases);
 
  // No-op for this architecture: unnamed parameter, empty body.
  502    static void PrepareInternals(Tensor_t &) {}
 
  // Forward pass of a convolutional layer (declaration; the final parameter
  // line and the terminating `);` are on listing lines not shown here).
  // `output` and `inputActivationFunc` are the writable tensors; input,
  // weights, biases and the convolution parameters are read-only. The
  // Tensor_t parameter after activFunc and the ConvDescriptors_t parameter
  // are unnamed.
  // Fix: the convolution-parameter argument had been garbled to "¶ms" by an
  // HTML-entity mis-decoding of "&params"; the reference declarator is restored.
  505    static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, 
const Tensor_t &input,
 
  506                                 const Matrix_t &weights, 
const Matrix_t &biases, 
const DNN::CNN::TConvParams &params,
 
  507                                 EActivationFunction activFunc, Tensor_t & ,
 
  508                                 const ConvDescriptors_t & , 
 
  // Backward pass of a convolutional layer (declaration only). The line
  // carrying the storage class and return type (listing line 524) is not
  // shown in this view.
  // Non-const references: activationGradientsBackward, weightGradients,
  // biasGradients, df, activationGradients and the unnamed workspace.
  // Read-only: weights, activationBackward, outputTensor and the unnamed
  // descriptors. The remaining arguments are batch, image and filter sizes.
  525    ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients,
 
  526                      Tensor_t &df, Tensor_t &activationGradients, 
const Matrix_t &weights,
 
  527                      const Tensor_t &activationBackward, 
const Tensor_t &outputTensor, EActivationFunction activFunc,
 
  528                      const ConvDescriptors_t & , ConvWorkspace_t & , 
size_t batchSize,
 
  529                      size_t inputHeight, 
size_t inputWidth, 
size_t depth, 
size_t height, 
size_t width,
 
  530                      size_t filterDepth, 
size_t filterHeight, 
size_t filterWidth, 
size_t nLocalViews);
 
  // Helper of the convolution backward pass (declaration only): writes
  // activationGradientsBackward from the read-only df and weights, given the
  // batch, input-image, output and filter dimensions.
  534    static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, 
const Tensor_t &df,
 
  535                                                 const Matrix_t &weights, 
size_t batchSize, 
size_t inputHeight,
 
  536                                                 size_t inputWidth, 
size_t depth, 
size_t height, 
size_t width,
 
  537                                                 size_t filterDepth, 
size_t filterHeight, 
size_t filterWidth);
 
  // Helpers of the convolution backward pass (declarations only). Both are
  // cut off in this listing: each ends on a trailing comma and the final
  // parameter line is not shown.
  //
  // Writes weightGradients from the read-only df and activations_backward.
  541    static void CalculateConvWeightGradients(Matrix_t &weightGradients, 
const Tensor_t &df,
 
  542                                             const Tensor_t &activations_backward, 
size_t batchSize, 
size_t inputHeight,
 
  543                                             size_t inputWidth, 
size_t depth, 
size_t height, 
size_t width,
 
  544                                             size_t filterDepth, 
size_t filterHeight, 
size_t filterWidth,
 
  // Writes biasGradients from the read-only df.
  549    static void CalculateConvBiasGradients(Matrix_t &biasGradients, 
const Tensor_t &df, 
size_t batchSize, 
size_t depth,
 
  // Pooling forward pass (declaration only; definition not in this chunk).
  // A and B are writable tensors, C is read-only; the descriptor and workspace
  // parameters are unnamed. The remaining arguments give image size, filter
  // size and strides.
  // NOTE(review): presumably A receives the pooled output and B the indices of
  // the selected elements, matching the `indexMatrix` parameter of
  // MaxPoolLayerBackward — confirm against the implementation.
  564    static void Downsample(Tensor_t &A, Tensor_t &B, 
const Tensor_t &C, 
const PoolingDescriptors_t & ,
 
  565                           PoolingWorkspace_t & , 
size_t imgHeight, 
size_t imgWidth, 
size_t fltHeight,
 
  566                           size_t fltWidth, 
size_t strideRows, 
size_t strideCols);
 
  // Max-pool backward pass (declaration; cut off in this listing — it ends on
  // a trailing comma and the final parameter line is not shown).
  // activationGradientsBackward is the only named writable tensor;
  // activationGradients and indexMatrix are read-only. Two further read-only
  // tensors, the descriptors and the workspace are unnamed.
  576    static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, 
const Tensor_t &activationGradients,
 
  577                                     const Tensor_t &indexMatrix, 
const Tensor_t & ,
 
  578                                     const Tensor_t & , 
const PoolingDescriptors_t & ,
 
  579                                     PoolingWorkspace_t & , 
size_t imgHeight, 
size_t imgWidth,
 
  580                                     size_t fltHeight, 
size_t fltWidth, 
size_t strideRows, 
size_t strideCols,
 
  // Shape-changing copies (declarations only; definitions not in this chunk).
  // In each case the first argument is the writable destination and the
  // second the read-only source.
  594    static void Reshape(Matrix_t &A, 
const Matrix_t &B);
 
  598    static void Flatten(Tensor_t &A, 
const Tensor_t &B); 
 
  602    static void Deflatten(Tensor_t &A, 
const Tensor_t &B); 
 
  605    static void Rearrange(Tensor_t &out, 
const Tensor_t &in);
 
  // Backward pass of a recurrent layer (declaration only). Returns a Matrix_t
  // reference; which argument it refers to is defined in the implementation.
  // Writable: state_gradients_backward, input_weight_gradients,
  // state_weight_gradients, bias_gradients and input_gradient. Read-only:
  // state, weights_input, weights_state and input.
  // The listing numbering jumps from 610 to 612, so one parameter line is
  // missing from this view.
  608    static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, 
 
  609                                            Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients,
 
  610                                            Matrix_t &bias_gradients,
 
  612                                            const Matrix_t &state,         
 
  613                                            const Matrix_t &weights_input, 
 
  614                                            const Matrix_t &weights_state, 
 
  615                                            const Matrix_t &input,         
 
  616                                            Matrix_t &input_gradient);
 
  // Matrix products (declarations only; definitions not in this chunk).
  // Multiply writes C from the read-only A and B.
  635    static void Multiply(Matrix_t &C, 
const Matrix_t &A, 
const Matrix_t &B);
 
  // TransposeMultiply is cut off in this listing: it ends on a trailing comma
  // after `alpha` (default 1.0) and the final parameter line is not shown.
  639    static void TransposeMultiply(Matrix_t &output, 
const Matrix_t &input, 
const Matrix_t &Weights, Scalar_t alpha = 1.0,
 
  // Element-wise and reduction utilities (declarations only).
  // Hadamard: A is writable, B read-only; tensor and matrix overloads.
  644    static void Hadamard(Tensor_t &A, 
const Tensor_t &B);
 
  645    static void Hadamard(Matrix_t &A, 
const Matrix_t &B);
 
  // SumColumns: writes B from the read-only A; alpha defaults to 1.0 and beta
  // to 0.
  654    static void SumColumns(Matrix_t &B, 
const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.);
 
  // Sum: returns a Scalar_t computed from the read-only A.
  657    static Scalar_t Sum(
const Matrix_t &A);
 
  // AlmostEquals: compares A and B with a tolerance epsilon that defaults to
  // 0.1. How epsilon is applied (absolute or relative) is defined in the
  // implementation.
  660    static bool AlmostEquals(
const Matrix_t &A, 
const Matrix_t &B, 
double epsilon = 0.1);
 
  // In-place element-wise operations on a matrix (declarations only). Each
  // takes the matrix as a writable reference; ConstAdd and ConstMult take an
  // additional scalar beta.
  665    static void ConstAdd(Matrix_t &A, Scalar_t beta);
 
  670    static void ConstMult(Matrix_t &A, Scalar_t beta);
 
  675    static void ReciprocalElementWise(Matrix_t &A);
 
  680    static void SquareElementWise(Matrix_t &A);
 
  685    static void SqrtElementWise(Matrix_t &A);
 
  // Adam optimiser update steps (declarations only; the update formulas are in
  // the implementation, not in this chunk).
  // AdamUpdate: A is writable; M and V are read-only; alpha and eps are
  // scalars. Presumably M and V are the first- and second-moment estimates —
  // confirm.
  688    static void AdamUpdate(Matrix_t &A, 
const Matrix_t &M, 
const Matrix_t &V, Scalar_t alpha, Scalar_t eps);
 
  // Moment updates: A is writable, B read-only, beta a scalar.
  689    static void AdamUpdateFirstMom(Matrix_t &A, 
const Matrix_t &B, Scalar_t beta);
 
  690    static void AdamUpdateSecondMom(Matrix_t &A, 
const Matrix_t &B, Scalar_t beta);
 
  // Prints a tensor (declaration). `name` is taken by value and defaults to
  // "Cpu-tensor"; `truncate` defaults to false.
  693    static void PrintTensor(
const Tensor_t &A, 
const std::string name = 
"Cpu-tensor", 
bool truncate = 
false);
 
  // Definition of the matrix CopyDiffArch overload.
  // The source matrix A is first converted to a TMatrixT<AReal> (tmp); a
  // temporary TCpuMatrix<AReal> is then built from tmp and passed to Copy()
  // with B as destination.
  // The second parameter line (declaring A) and the braces are on listing
  // lines not shown here.
  698 template <
typename AReal>
 
  699 template <
typename AMatrix_t>
 
  700 void TCpu<AReal>::CopyDiffArch(TCpuMatrix<AReal> &B,
 
  705    TMatrixT<AReal> tmp = A;  
 
  706    Copy(B, TCpuMatrix<AReal>(tmp) );
 
  // Definition of the tensor CopyDiffArch overload.
  // Asserts that A and B have the same GetSize(), then loops over the first
  // dimension of A: slice A.At(i) is converted to a TMatrixT<AReal>, wrapped in
  // a temporary TCpuMatrix<AReal>, and copied into the matrix obtained from
  // B.At(i).GetMatrix().
  // NOTE(review): the copy lands in the local `tmpOut`; this only updates B if
  // GetMatrix() returns a matrix sharing B's storage — presumably it does,
  // confirm against TCpuTensor::GetMatrix.
  // The second parameter line (declaring A) and the braces are on listing
  // lines not shown here.
  710 template <
typename AReal>
 
  711 template <
typename ATensor_t>
 
  712 void TCpu<AReal>::CopyDiffArch(TCpuTensor<AReal> &B,
 
  716    R__ASSERT(A.GetSize() == B.GetSize());
 
  718    for (
size_t i = 0; i < A.GetFirstSize(); ++i) {
 
  719       TMatrixT<AReal> tmpIn = A.At(i);  
 
  721       TCpuMatrix<AReal> tmpOut = B.At(i).GetMatrix();    
 
  722       Copy(tmpOut, TCpuMatrix<AReal>(tmpIn));
 
  // Definition of the vector CopyDiffArch overload: calls the single-matrix
  // CopyDiffArch for every index of A, pairing A[i] with B[i].
  // NOTE(review): the loop bound is A.size() and B[i] is indexed without a
  // size check, so B must hold at least A.size() elements — confirm callers
  // guarantee this.
  // (The braces of the function body are on listing lines not shown here.)
  732 template <
typename AReal>
 
  733 template <
typename AMatrix_t>
 
  734 void TCpu<AReal>::CopyDiffArch(std::vector<TCpuMatrix<AReal>> &A, 
const std::vector<AMatrix_t> &B)
 
  736    for (
size_t i = 0; i < A.size(); ++i) {
 
  737       CopyDiffArch(A[i], B[i]);
 
  741 template <
typename AReal>
 
  742 void TCpu<AReal>::PrintTensor(
const typename TCpu<AReal>::Tensor_t & A, 
const std::string name, 
bool truncate )
 
  744    std::cout << name << 
" size = " << A.GetSize() << 
" shape = { ";
 
  745    auto shape = A.GetShape();
 
  746    for (
size_t k = 0; k < shape.size()-1; ++k)
 
  747       std::cout << shape[k] << 
" , ";
 
  748    std::cout << shape.back() << 
" } ";
 
  752    std::cout << 
" tensor count " << A.GetBufferUseCount() << std::endl;
 
  753    if (A.GetShape().size() == 2 ) {
 
  754       for (
size_t i = 0; i < A.GetShape()[0]; ++i) {
 
  756          size_t n =  A.GetShape()[1];
 
  757          if (truncate) n = std::min(n,
size_t(10));
 
  758          for (
size_t j = 0; j < n; ++j) {
 
  759             std::cout << A(i,j) << 
" ";
 
  761           if (truncate && n < A.GetShape()[1]) std::cout << 
" ...... ";
 
  762          std::cout << 
" } " << std::endl;
 
  764    } 
else if  (A.GetShape().size() == 3 ) {
 
  765       for (
size_t i = 0; i < A.GetFirstSize(); ++i) {
 
  767          for (
size_t j = 0; j < A.GetHSize(); ++j) {
 
  769             size_t n =  A.GetWSize();
 
  770             if (truncate)  n = std::min(n,
size_t(10));
 
  771             for (
size_t k = 0; k < n; ++k) {
 
  772                std::cout << A(i,j,k) << 
" ";
 
  774             if (truncate && n < A.GetWSize()) std::cout << 
" ...... ";
 
  775             std::cout << 
" } " << std::endl;
 
  777          std::cout << 
" } " << std::endl;
 
  781       for (
size_t l = 0; l < A.GetSize(); ++l) {
 
  782          std::cout << A.GetData()[l] << 
" ";