32 template <
typename AFloat>
 
   33 void TCpu<AFloat>::MultiplyTranspose(TCpuMatrix<AFloat> &output, 
const TCpuMatrix<AFloat> &input,
 
   34                                      const TCpuMatrix<AFloat> &Weights)
 
   37    int m = (int)input.GetNrows();
 
   38    int k = (int)input.GetNcols();
 
   39    int n = (int)Weights.GetNrows();
 
   41    if ((
int)output.GetNrows() != m) {
 
   42       Error(
"MultiplyTranspose",
"Invalid input - output  rows  - input:  %d != output : %d",m, (
int) output.GetNrows());
 
   43       R__ASSERT((
int) output.GetNrows() == m);
 
   45    if ((
int)output.GetNcols() != n) {
 
   46       Error(
"MultiplyTranspose",
"Invalid output cols or weight  rows  - output cols:  %d != weight rows : %d",(
int) output.GetNcols(),n);
 
   47       R__ASSERT((
int) output.GetNcols() == n);
 
   49    if ((
int)Weights.GetNcols() != k) {
 
   50       Error(
"MultiplyTranspose",
"Invalid input cols or weight cols  - input cols:  %d != weight cols : %d", k, (
int) Weights.GetNcols());
 
   51       R__ASSERT((
int) Weights.GetNcols() == k);
 
   62    const AFloat *A = input.GetRawDataPointer();
 
   63    const AFloat *B = Weights.GetRawDataPointer();
 
   64    AFloat *C = output.GetRawDataPointer();
 
   66    ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, A, &m, B, &n, &beta, C, &m);
 
   68    TMatrixT<AFloat> tmp(output.GetNrows(), output.GetNcols());
 
   69    tmp.MultT( input,Weights);
 
   74 template <
typename AFloat>
 
   75 void TCpu<AFloat>::AddRowWise(TCpuMatrix<AFloat> &output, 
const TCpuMatrix<AFloat> &biases)
 
   78    int m = (int)output.GetNrows();
 
   79    int n = (int)output.GetNcols();
 
   84    AFloat *A = output.GetRawDataPointer();
 
   85    const AFloat *x = TCpuMatrix<AFloat>::GetOnePointer();
 
   86    const AFloat *y = biases.GetRawDataPointer();
 
   88    R__ASSERT(m <= (
int)TCpuMatrix<AFloat>::GetOnePointerSize());
 
   89    R__ASSERT(n <= (
int)(biases.GetNcols()*biases.GetNrows()));
 
   91    ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
 
   94    TReference<AFloat>::AddRowWise(tmp, biases);
 
   99 template <
typename AFloat>
 
  100 void TCpu<AFloat>::Backward(TCpuTensor<AFloat> &activationGradientsBackward, TCpuMatrix<AFloat> &weightGradients,
 
  101                             TCpuMatrix<AFloat> &biasGradients, 
const TCpuTensor<AFloat> &df,
 
  102                             const TCpuTensor<AFloat> &, 
const TCpuMatrix<AFloat> &weights,
 
  103                             const TCpuTensor<AFloat> &activationsBackward)
 
  108    Matrix_t df_m = df.GetMatrix();
 
  111    if (activationGradientsBackward.GetSize() > 0 ) {
 
  113       Matrix_t  activationGradientsBackward_m = activationGradientsBackward.GetMatrix();
 
  115       Multiply(activationGradientsBackward_m, df_m, weights);
 
  119    if (weightGradients.GetNoElements() > 0) TransposeMultiply(weightGradients, df_m, activationsBackward.GetMatrix());
 
  125    if (biasGradients.GetNoElements() > 0) SumColumns(biasGradients, df_m);
 
  131 template <
typename AFloat>
 
  132 void TCpu<AFloat>::Im2col(TCpuMatrix<AFloat> &A, 
const TCpuMatrix<AFloat> &B, 
size_t imgHeight, 
size_t imgWidth,
 
  133                           size_t fltHeight, 
size_t fltWidth, 
size_t strideRows, 
size_t strideCols,
 
  134                           size_t zeroPaddingHeight, 
size_t zeroPaddingWidth)
 
  138    int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
 
  139    int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
 
  140    size_t currLocalView = 0;
 
  142    const int halfFltHeight =  fltHeight / 2;
 
  143    const int halfFltWidth =  fltWidth / 2;
 
  144    const int halfFltHeightM1 = (fltHeight - 1) / 2;
 
  145    const int halfFltWidthM1 = (fltWidth - 1) / 2;
 
  146    const int nRowsInput = B.GetNrows();
 
  147    const int nColsInput = B.GetNcols();
 
  148    const int nRowsOutput = A.GetNrows();
 
  149    const int nColsOutput = A.GetNcols();
 
  152    for (
int i = halfFltHeight -zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
 
  153       for (
int j = halfFltWidth -zeroPaddingWidth ; j <= imgWidthBound; j += strideCols) {
 
  154          size_t currLocalViewPixel = 0;
 
  157          R__ASSERT((
int) currLocalView < nRowsOutput );
 
  159          for (
int m = 0; m < nRowsInput; m++) {
 
  160             for (
int k = i - halfFltHeight  ; k <= Int_t(i + halfFltHeightM1 ); k++) {
 
  161                int kstep = k * imgWidth;
 
  162                for (
int l = j - halfFltWidth ; l <= Int_t(j + halfFltWidthM1); l++) {
 
  165                   R__ASSERT((
int) currLocalViewPixel < nColsOutput );
 
  167                   if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >=  nColsInput)
 
  168                      A(currLocalView, currLocalViewPixel++) = 0;
 
  170                      A(currLocalView, currLocalViewPixel++) = B(m, kstep + l);
 
  182 template <
typename AFloat>
 
  183 void TCpu<AFloat>::Im2colIndices(std::vector<int> &V, 
const TCpuMatrix<AFloat> &B, 
size_t nLocalViews, 
size_t imgHeight, 
size_t imgWidth,
 
  184                           size_t fltHeight, 
size_t fltWidth, 
size_t strideRows, 
size_t strideCols,
 
  185                            size_t zeroPaddingHeight, 
size_t zeroPaddingWidth)
 
  189    int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
 
  190    int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
 
  191    size_t currLocalView = 0;
 
  193    const int halfFltHeight =  fltHeight / 2;
 
  194    const int halfFltWidth =  fltWidth / 2;
 
  195    const int halfFltHeightM1 = (fltHeight - 1) / 2;
 
  196    const int halfFltWidthM1 = (fltWidth - 1) / 2;
 
  197    const int nRowsInput = B.GetNrows();
 
  198    const int nColsInput = B.GetNcols();
 
  199    const size_t nSizeOutput = V.size();
 
  200    const int npixels =  nRowsInput * fltHeight * fltWidth;
 
  205    for (
int i = halfFltHeight -zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
 
  206       for (
int j = halfFltWidth -zeroPaddingWidth ; j <= imgWidthBound; j += strideCols) {
 
  207          size_t currLocalViewPixel = 0;
 
  212          for (
int m = 0; m < nRowsInput; m++) {
 
  213             for (
int k = i - halfFltHeight  ; k <= Int_t(i + halfFltHeightM1 ); k++) {
 
  214                int kstep = k * imgWidth;
 
  215                for (
int l = j - halfFltWidth ; l <= Int_t(j + halfFltWidthM1); l++) {
 
  219                   R__ASSERT(currLocalView * npixels + currLocalViewPixel < nSizeOutput );
 
  220                   if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >=  nColsInput)
 
  222                      V[currLocalViewPixel * nLocalViews + currLocalView] = -1;
 
  224                      V[currLocalViewPixel * nLocalViews + currLocalView]= ( kstep + l) * nRowsInput + m;
 
  226                   currLocalViewPixel++;
 
  234 template <
typename AFloat>
 
  235 void TCpu<AFloat>::Im2colFast(TCpuMatrix<AFloat> &A, 
const TCpuMatrix<AFloat> &B, 
const std::vector<int> &V)
 
  238    R__ASSERT( n == A.GetNcols() * A.GetNrows() );
 
  239    AFloat *  a = A.GetRawDataPointer();
 
  240    const AFloat *  b = B.GetRawDataPointer();
 
  245    const size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(n);
 
  247    auto f = [&](UInt_t workerID)
 
  249       for (
size_t j = 0; j < nsteps; ++j) {
 
  250          size_t ii = workerID+j;
 
  253          if (idx >= 0) a[ii] = b[idx];
 
  259    A.GetThreadExecutor().Foreach(f, ROOT::TSeqI(0,n,nsteps) );
 
  263    for (
size_t ii = 0; ii < n; ++ii) {
 
  265       if (idx >= 0) a[ii] = b[idx];
 
  272 template <
typename AFloat>
 
  273 void TCpu<AFloat>::RotateWeights(TCpuMatrix<AFloat> &A, 
const TCpuMatrix<AFloat> &B, 
size_t filterDepth,
 
  274                                  size_t filterHeight, 
size_t filterWidth, 
size_t numFilters)
 
  276    size_t jump = filterHeight * filterWidth;
 
  277    for (
size_t j = 0; j < filterDepth; j++) {
 
  278       for (
size_t k = 0; k < numFilters; k++) {
 
  279          for (
size_t i = 0; i < jump; i++) {
 
  280             A(j, k * jump + i) = B(k, ((j + 1) * jump - 1) - i);
 
  288 template <
typename AFloat>
 
  289 void TCpu<AFloat>::AddConvBiases(TCpuMatrix<AFloat> &output, 
const TCpuMatrix<AFloat> &biases)
 
  291 #ifdef R__HAS_TMVACPU 
  292    int m = (int)output.GetNrows();
 
  293    int n = (int)output.GetNcols();
 
  298    AFloat *A = output.GetRawDataPointer();
 
  299    const AFloat *x = biases.GetRawDataPointer();
 
  300    const AFloat *y = TCpuMatrix<AFloat>::GetOnePointer();
 
  302    R__ASSERT(m <= (
int)biases.GetNoElements() );
 
  303    R__ASSERT(n <= (
int)TCpuMatrix<AFloat>::GetOnePointerSize() );
 
  305    ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
 
  307    TMatrixT<AFloat> tmp;
 
  308    TReference<AFloat>::AddConvBiases(tmp, biases);
 
  313 template<
typename AFloat>
 
  314 size_t TCpu<AFloat>::calculateDimension(
size_t imgDim, 
size_t fltDim, 
size_t padding, 
size_t stride)
 
  316    size_t temp = imgDim - fltDim + 2 * padding;
 
  317    if (temp % stride || temp + stride <= 0) {
 
  318       Fatal(
"calculateDimension", 
"Not compatible hyper parameters for layer - (imageDim, filterDim, padding, stride) " 
  319             "%zu, %zu, %zu, %zu", imgDim, fltDim, padding, stride);
 
  321    return temp / stride + 1;
 
  325 template <
typename AFloat>
 
  326 void TCpu<AFloat>::ConvLayerForward(TCpuTensor<AFloat> & output,
 
  327                                     TCpuTensor<AFloat> & inputActivationFunc,
 
  328                                     const TCpuTensor<AFloat> &input,
 
  329                                     const TCpuMatrix<AFloat> &weights, 
const TCpuMatrix<AFloat> & biases,
 
  330                                     const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
 
  331                                     TCpuTensor<AFloat> & ,
 
  332                                     const ConvDescriptors_t & ,
 
  335    size_t height = calculateDimension(params.inputHeight, params.filterHeight, params.paddingHeight, params.strideRows);
 
  336    size_t width = calculateDimension(params.inputWidth, params.filterWidth, params.paddingWidth, params.strideCols);
 
  337    size_t nLocalViews = height * width;
 
  338    size_t nLocalViewPixels = params.inputDepth * params.filterHeight * params.filterWidth;
 
  340    R__ASSERT( input.GetSize() > 0);
 
  341    std::vector<int> forwardIndices(nLocalViews * nLocalViewPixels);
 
  342    Im2colIndices(forwardIndices, input.At(0).GetMatrix(), nLocalViews, params.inputHeight, params.inputWidth, params.filterHeight,
 
  343                  params.filterWidth, params.strideRows, params.strideCols, params.paddingHeight, params.paddingWidth);
 
  346    TCpuMatrix<AFloat>::InitializeOneVector(nLocalViews);
 
  347    TCpuMatrix<AFloat>::InitializeOneVector(output.GetWSize());   
 
  350    auto f = [&] (UInt_t i)
 
  357        TCpuMatrix<AFloat> inputTr(nLocalViews, nLocalViewPixels);
 
  360        Im2colFast(inputTr, input.At(i).GetMatrix(), forwardIndices);
 
  362        Matrix_t output_m = output.At(i).GetMatrix();
 
  363        MultiplyTranspose(output_m, weights, inputTr);
 
  364        AddConvBiases(output_m, biases);
 
  368    TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(input.GetFirstSize()));
 
  372    Copy(inputActivationFunc, output);
 
  375    ActivationFunctionForward(output, activFunc, ActivationDescriptor_t());
 
  379 template <
typename AFloat>
 
  380 void TCpu<AFloat>::ConvLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
 
  381                                      TCpuMatrix<AFloat> &weightGradients, TCpuMatrix<AFloat> &biasGradients,
 
  382                                      TCpuTensor<AFloat> &inputActivationFunc,
 
  383                                      TCpuTensor<AFloat> &activationGradients,
 
  384                                      const TCpuMatrix<AFloat> &weights,
 
  385                                      const TCpuTensor<AFloat> &activationsBackward,
 
  386                                      const Tensor_t & outputTensor,
 
  387                                      EActivationFunction activFunc,
 
  388                                      const ConvDescriptors_t & ,
 
  390                                      size_t batchSize,   
size_t inputHeight,
 
  391                                      size_t inputWidth,  
size_t depth,
 
  392                                      size_t height,      
size_t width,
 
  393                                      size_t filterDepth, 
size_t filterHeight,
 
  394                                      size_t filterWidth, 
size_t nLocalViews)
 
  404    Tensor_t df(activationGradients.GetShape() );   
 
  405    ActivationFunctionBackward(df, outputTensor, activationGradients, inputActivationFunc,
 
  406                               activFunc, ActivationDescriptor_t() );
 
  412    CalculateConvActivationGradients(activationGradientsBackward, df, weights, batchSize, inputHeight, inputWidth, depth,
 
  413                                     height, width, filterDepth, filterHeight, filterWidth);
 
  416    CalculateConvWeightGradients(weightGradients, df, activationsBackward, batchSize, inputHeight, inputWidth, depth,
 
  417                                 height, width, filterDepth, filterHeight, filterWidth, nLocalViews);
 
  420    CalculateConvBiasGradients(biasGradients, df, batchSize, depth, nLocalViews);
 
  424 template <
typename AFloat>
 
  425 void TCpu<AFloat>::CalculateConvActivationGradients(TCpuTensor<AFloat> &activationGradientsBackward,
 
  426                                                     const TCpuTensor<AFloat> &df,
 
  427                                                     const TCpuMatrix<AFloat> &weights, 
size_t batchSize,
 
  428                                                     size_t inputHeight, 
size_t inputWidth, 
size_t depth, 
size_t height,
 
  429                                                     size_t width, 
size_t filterDepth, 
size_t filterHeight,
 
  432    if (activationGradientsBackward.GetSize() == 0) 
return;
 
  435    activationGradientsBackward.Zero();
 
  442    TCpuMatrix<AFloat> rotWeights(filterDepth, depth * filterHeight * filterWidth);
 
  443    RotateWeights(rotWeights, weights, filterDepth, filterHeight, filterWidth, weights.GetNrows());
 
  447    size_t tempZeroPaddingHeight = (size_t)(floor((inputHeight - height + filterHeight - 1) / 2));
 
  448    size_t tempZeroPaddingWidth = (size_t)(floor((inputWidth - width + filterWidth - 1) / 2));
 
  454    size_t tempNLocalViews = inputHeight * inputWidth;
 
  455    size_t tempNLocalViewPixels = depth * filterHeight * filterWidth;
 
  457    size_t tempStrideRows = 1;
 
  458    size_t tempStrideCols = 1;
 
  462     std::vector<int> vIndices( tempNLocalViews * tempNLocalViewPixels );
 
  463     Im2colIndices(vIndices, df.At(0).GetMatrix(), tempNLocalViews, height, width, filterHeight, filterWidth, tempStrideRows, tempStrideCols,
 
  464              tempZeroPaddingHeight, tempZeroPaddingWidth);
 
  468     R__ASSERT(batchSize == df.GetFirstSize() );
 
  469     R__ASSERT(batchSize == activationGradientsBackward.GetFirstSize() );
 
  470     auto f = [&] (UInt_t i)
 
  475       TCpuMatrix<AFloat> dfTr(tempNLocalViews, tempNLocalViewPixels);
 
  477       Im2colFast(dfTr, df.At(i).GetMatrix(), vIndices);
 
  482       Matrix_t agb_m = activationGradientsBackward.At(i).GetMatrix();
 
  483       MultiplyTranspose(agb_m, rotWeights, dfTr);
 
  489     TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI( batchSize ) );
 
  493 template <
typename AFloat>
 
  494 void TCpu<AFloat>::CalculateConvWeightGradients(TCpuMatrix<AFloat> &weightGradients,
 
  495                                                 const TCpuTensor<AFloat> &df,
 
  496                                                 const TCpuTensor<AFloat> &activationsBackward,
 
  497                                                 size_t batchSize, 
size_t inputHeight, 
size_t inputWidth, 
size_t depth,
 
  498                                                 size_t height, 
size_t width, 
size_t filterDepth, 
size_t filterHeight,
 
  499                                                 size_t filterWidth, 
size_t nLocalViews)
 
  502    weightGradients.Zero();
 
  504    const size_t filterSize = filterHeight * filterWidth;
 
  505    const size_t nLocalViewPixels = filterDepth * filterHeight * filterWidth;
 
  506    R__ASSERT( weightGradients.GetNcols() == filterDepth * filterHeight * filterWidth);
 
  508    const size_t tempStrideRows = 1;
 
  509    const size_t tempStrideCols = 1;
 
  512    const size_t tempZeroPaddingHeight = (height - inputHeight + filterHeight - 1) / 2;
 
  513    const size_t tempZeroPaddingWidth = (width - inputWidth + filterWidth - 1) / 2;
 
  520    std::vector<int> vIndices(nLocalViews * nLocalViewPixels );
 
  521    Im2colIndices(vIndices, activationsBackward.At(0).GetMatrix(), nLocalViews, inputHeight, inputWidth, filterHeight , filterWidth,
 
  522              tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);
 
  534    TCpuTensor<AFloat> vres( batchSize, depth, nLocalViewPixels);
 
  536    auto fmap = [&](
int i) {
 
  539       TCpuMatrix<AFloat> xTr(nLocalViews, nLocalViewPixels);
 
  540       TCpuMatrix<AFloat> res(depth, nLocalViewPixels);
 
  548       Im2colFast(xTr, activationsBackward.At(i).GetMatrix(), vIndices);
 
  553       Matrix_t mres = vres.At(i).GetMatrix();
 
  554       Multiply( mres, df.At(i).GetMatrix(), xTr);
 
  561    TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(fmap, ROOT::TSeqI( batchSize ) );
 
  564       R__ASSERT(vres.GetFirstSize() == batchSize);
 
  565       for (
size_t i = 0; i < batchSize; i++) {
 
  567          Matrix_t vres_m = vres.At(i).GetMatrix();
 
  568          for (
size_t j = 0; j < depth; j++) {
 
  569             for (
size_t k = 0; k < filterDepth; k++) {
 
  570                size_t kOffset = k * filterSize;
 
  571                for (
size_t l = 0; l < filterSize; l++) {
 
  573                   weightGradients(j, kOffset + l) += vres_m(j,  kOffset + l);
 
  586 template <
typename AFloat>
 
  587 void TCpu<AFloat>::CalculateConvBiasGradients(TCpuMatrix<AFloat> &biasGradients, 
const TCpuTensor<AFloat> &df,
 
  588                                               size_t batchSize, 
size_t depth, 
size_t nLocalViews)
 
  590    biasGradients.Zero();
 
  591    for (
size_t i = 0; i < depth; i++) {
 
  593       for (
size_t j = 0; j < nLocalViews; j++) {
 
  594          for (
size_t k = 0; k < batchSize; k++) {
 
  599       biasGradients(i, 0) = sum;
 
  604 template <
typename AFloat>
 
  605 void TCpu<AFloat>::Downsample(TCpuTensor<AFloat> &tA, TCpuTensor<AFloat> &tB, 
const TCpuTensor<AFloat> &tC,
 
  606                               const PoolingDescriptors_t & ,
 
  607                               PoolingWorkspace_t & ,
 
  608                               size_t imgHeight, 
size_t imgWidth, 
size_t fltHeight, 
size_t fltWidth, 
size_t strideRows,
 
  613    assert( tA.GetFirstSize() == tC.GetFirstSize());
 
  614    for (
size_t ifirst = 0; ifirst < tC.GetFirstSize(); ++ifirst) {
 
  616       Matrix_t A = tA.At(ifirst).GetMatrix();
 
  617       Matrix_t B = tB.At(ifirst).GetMatrix();
 
  618       Matrix_t C = tC.At(ifirst).GetMatrix();
 
  621       int imgHeightBound = imgHeight - (fltHeight - 1) / 2 - 1;
 
  622       int imgWidthBound = imgWidth - (fltWidth - 1) / 2 - 1;
 
  623       size_t currLocalView = 0;
 
  626       for (
int i = fltHeight / 2; i <= imgHeightBound; i += strideRows) {
 
  627          for (
int j = fltWidth / 2; j <= imgWidthBound; j += strideCols) {
 
  629             for (
int m = 0; m < (Int_t)C.GetNrows(); m++) {
 
  630                AFloat value = -std::numeric_limits<AFloat>::max();
 
  632                for (
int k = i - fltHeight / 2; k <= Int_t(i + (fltHeight - 1) / 2); k++) {
 
  633                   for (
int l = j - fltWidth / 2; l <= Int_t(j + (fltWidth - 1) / 2); l++) {
 
  634                      if (C(m, k * imgWidth + l) > value) {
 
  635                         value = C(m, k * imgWidth + l);
 
  636                         B(m, currLocalView) = k * imgWidth + l;
 
  640                A(m, currLocalView) = value;
 
  649 template <
typename AFloat>
 
  650 void TCpu<AFloat>::MaxPoolLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
 
  651                                         const TCpuTensor<AFloat> &activationGradients,
 
  652                                         const TCpuTensor<AFloat> &indexMatrix,
 
  653                                         const TCpuTensor<AFloat> & ,
 
  654                                         const TCpuTensor<AFloat> & ,
 
  655                                         const PoolingDescriptors_t & ,
 
  656                                         PoolingWorkspace_t & ,
 
  666    assert( activationGradientsBackward.GetFirstSize() == activationGradients.GetFirstSize());
 
  667    for (
size_t l = 0; l < activationGradients.GetFirstSize(); ++l) {
 
  669       Matrix_t activationGradientsBackward_m = activationGradientsBackward.At(l).GetMatrix();
 
  670       Matrix_t activationGradients_m = activationGradients.At(l).GetMatrix();
 
  671       Matrix_t indexMatrix_m = indexMatrix.At(l).GetMatrix();
 
  673       size_t depth = activationGradientsBackward_m.GetNrows();
 
  675       for (
size_t j = 0; j < depth; j++) {
 
  677          for (
size_t t = 0; t < (size_t)activationGradientsBackward_m.GetNcols(); t++) {
 
  678             activationGradientsBackward_m(j, t) = 0;
 
  682          for (
size_t k = 0; k < nLocalViews; k++) {
 
  683             AFloat grad = activationGradients_m(j, k);
 
  684             size_t winningIdx = indexMatrix_m(j, k);
 
  685             activationGradientsBackward_m(j, winningIdx) += grad;
 
  692 template <
typename AFloat>
 
  693 TCpuTensor<AFloat> TCpu<AFloat>::BatchNormLayerReshapeTensor(
int axis, 
const TCpuTensor<AFloat> &x) {
 
  698       typename TCpuTensor<AFloat>::Shape_t newShape = { x.GetSize() / x.GetShape().front() , x.GetShape().front() }; 
 
  699       TCpuTensor<AFloat> xtmp(x.GetDeviceBuffer(), newShape, TCpuTensor<AFloat>::MemoryLayout::RowMajor);
 
  703    return  x.Reshape( { x.GetShape().front(), x.GetSize()/ x.GetShape().front()});
 
  708 template <
typename AFloat>
 
  709 void TCpu<AFloat>::BatchNormLayerForwardTraining(
int axis, 
const TCpuTensor<AFloat> &x,
 
  710                                                  TCpuTensor<AFloat> & y,
 
  711                                                  Matrix_t &gamma, Matrix_t &beta,
 
  714                                                  Matrix_t & iVariance,
 
  715                                                  Matrix_t & runningMeans,
 
  716                                                  Matrix_t & runningVars,
 
  717                                                  Scalar_t nTrainedBatches,
 
  718                                                  Scalar_t momentum, Scalar_t epsilon,
 
  719                                                  const TensorDescriptor_t &  )
 
  727    TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis,x);
 
  728    TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis,y);
 
  730    assert (input.GetShape().size() == 2);
 
  731    size_t n = input.GetShape()[0];   
 
  732    size_t d = input.GetShape()[1];   
 
  734    TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
 
  735    TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();
 
  739    auto f = [&] (
size_t k)
 
  742       auto inputK = inputBuffer.GetSubBuffer(k * n, n);
 
  743       auto outputK = outputBuffer.GetSubBuffer(k * n, n);
 
  747       for (
size_t i = 0; i < n; i++) {
 
  748          AFloat xi = inputK[i];
 
  754       for (
size_t i = 0; i < n; i++) {
 
  755          AFloat xi = inputK[i];
 
  756          double xmu = xi - meanK;
 
  757          sq = sq + (xmu * xmu);
 
  758          outputK[i] = AFloat(xmu);
 
  761       variance(0,k) = sq / n;
 
  762       iVariance(0,k) = 1. / std::sqrt(variance(0,k) + epsilon);
 
  764       double iVK = iVariance(0, k);
 
  765       double gK = gamma(0, k);
 
  766       double bK = beta(0, k);
 
  767       for (
size_t i = 0; i < n; i++) {
 
  768          AFloat yi = outputK[i] ;
 
  769          outputK[i] = AFloat( gK * iVK * yi  + bK );
 
  775       if (nTrainedBatches == 0) {
 
  776          runningMeans(0,k) = mean(0,k);
 
  777          runningVars(0,k) = variance(0,k) * (n) / (Scalar_t(n - 1) + epsilon);
 
  779          double decay = momentum;
 
  780          if (momentum < 0) decay = nTrainedBatches/Scalar_t(nTrainedBatches+1);
 
  781          runningMeans(0,k) = decay * runningMeans(0,k) + (1. - decay) * mean(0,k);
 
  782          runningVars(0,k) = decay * runningVars(0,k) + (1.-decay) * variance(0,k) * (n) / (Scalar_t(n - 1) + epsilon);
 
  789    TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d) );
 
  793 template <
typename AFloat>
 
  794 void TCpu<AFloat>::BatchNormLayerForwardInference(
int axis, 
const TCpuTensor<AFloat> &x,
 
  797                                                   TCpuTensor<AFloat> &y,
 
  798                                                   const Matrix_t & runningMeans,
 
  799                                                   const Matrix_t & runningVars,
 
  801                                                   const TensorDescriptor_t & )
 
  803    TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis,x);
 
  804    TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis,y);
 
  806    assert (input.GetShape().size() == 2);
 
  807    size_t n = input.GetShape()[0];   
 
  808    size_t d = input.GetShape()[1];
 
  810    TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
 
  811    TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();
 
  813    auto f = [&] (
size_t k) {
 
  815       auto inputK = inputBuffer.GetSubBuffer(k * n, n);
 
  816       auto outputK = outputBuffer.GetSubBuffer(k * n, n);
 
  818       double gK = gamma(0, k);
 
  819       double bK = beta(0, k);
 
  820       double mK = runningMeans(0, k);
 
  821       double vK = 1. / (sqrt(runningVars(0, k) + epsilon));
 
  824       for (
size_t i = 0; i < n; i++) {
 
  825          AFloat xi = inputK[i];
 
  826          outputK[i] = AFloat( gK * (xi - mK) * vK + bK );
 
  830    TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d) );
 
  834 template <
typename AFloat>
 
  835 void TCpu<AFloat>::BatchNormLayerBackward(
int axis, 
const TCpuTensor<AFloat> &x,
 
  836                                                  const TCpuTensor<AFloat> &dy,
 
  837                                                  TCpuTensor<AFloat> &dx,
 
  839                                                  Matrix_t &dgamma, Matrix_t &dbeta,
 
  840                                                  const Matrix_t & mean,
 
  841                                                  const Matrix_t & variance,
 
  842                                                  const Matrix_t & iVariance,
 
  844                                                  const TensorDescriptor_t & )
 
  846    TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis,x);
 
  847    TCpuTensor<AFloat> inputGrad = BatchNormLayerReshapeTensor(axis,dx);
 
  848    TCpuTensor<AFloat> outputGrad = BatchNormLayerReshapeTensor(axis,dy);
 
  850    assert (outputGrad.GetShape().size() == 2);
 
  851    size_t n = outputGrad.GetShape()[0];   
 
  852    size_t d = outputGrad.GetShape()[1];
 
  854    TCpuBuffer<AFloat> & inputBuffer = input.GetDeviceBuffer();
 
  855    TCpuBuffer<AFloat> & dyBuffer = outputGrad.GetDeviceBuffer();
 
  856    TCpuBuffer<AFloat> & dxBuffer = inputGrad.GetDeviceBuffer();
 
  860    auto f = [&] (
size_t k) {
 
  863       auto inputK = inputBuffer.GetSubBuffer(k * n, n);
 
  864       auto outputGradK = dyBuffer.GetSubBuffer(k * n, n);
 
  865       auto inputGradK = dxBuffer.GetSubBuffer(k * n, n);
 
  866       auto meanK = mean(0, k);
 
  867       for (
size_t i = 0; i < n; i++) {
 
  868          AFloat xi = inputK[i];
 
  869          double xhat = xi - meanK;
 
  870          dbeta(0, k) += outputGradK[i];
 
  871          dgamma(0, k) += outputGradK[i] * xhat;
 
  873       double npSumDy = dbeta(0, k);
 
  874       double npSumDyHMu = dgamma(0, k);
 
  875       dgamma(0, k) *= iVariance(0, k);
 
  878       double bterm = npSumDyHMu / (variance(0, k) + epsilon);
 
  879       double aterm = (1. / double(n) * gamma(0, k) * iVariance(0, k));
 
  880       for (
size_t i = 0; i < n; i++) {
 
  881          AFloat xi = inputK[i];
 
  882          AFloat dyi = outputGradK[i];
 
  883          double xmu = xi - meanK;
 
  884          inputGradK[i] = AFloat( aterm * (n * dyi - npSumDy - xmu * bterm) );
 
  888    TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d) );
 
  892 template <
typename AFloat>
 
  893 void TCpu<AFloat>::Reshape(TCpuMatrix<AFloat> &A, 
const TCpuMatrix<AFloat> &B)
 
  895    size_t nColsA = A.GetNcols();
 
  896    size_t nColsB = B.GetNcols();
 
  898    for (
size_t i = 0; i < A.GetNrows(); i++) {
 
  899       for (
size_t j = 0; j < A.GetNcols(); j++) {
 
  900          size_t nElem = i * nColsA + j;
 
  901          A(i, j) = B(nElem / nColsB, nElem % nColsB);
 
  907 template <
typename AFloat>
 
  908 void TCpu<AFloat>::Flatten(TCpuTensor<AFloat> &A, 
const TCpuTensor<AFloat> &B )
 
  914    assert( B.GetShape().size() == 3  );
 
  915    assert( A.GetShape().size() == 3  );
 
  918    size_t bsize = B.GetFirstSize();
 
  919    size_t nRows = B.GetHSize();
 
  920    size_t nCols = B.GetWSize();
 
  922    assert (  A.GetFirstSize() == 1);
 
  923    assert (  A.GetHSize() == bsize);
 
  924    assert (  A.GetWSize() == nRows*nCols);
 
  926    for (
size_t i = 0; i < bsize; i++) {
 
  927       for (
size_t j = 0; j < nRows; j++) {
 
  928          for (
size_t k = 0; k < nCols; k++) {
 
  929             A( 0, i, j * nCols + k) = B(i, j, k);
 
  943 template <
typename AFloat>
 
  944 void TCpu<AFloat>::Deflatten(TCpuTensor<AFloat> &A, 
const TCpuTensor<AFloat> &B )
 
  947    assert( B.GetShape().size() == 3  );
 
  948    assert( A.GetShape().size() == 3  );
 
  950    size_t size = A.GetFirstSize();
 
  951    size_t nRows = A.GetHSize();
 
  952    size_t nCols = A.GetWSize();
 
  954    assert (  B.GetFirstSize() == 1);
 
  955    assert (  B.GetHSize() == size);
 
  956    assert (  B.GetWSize() == nRows*nCols);
 
  957    for (
size_t i = 0; i < (size_t)size; i++) {
 
  958       for (
size_t j = 0; j < (size_t)nRows; j++) {
 
  959          for (
size_t k = 0; k < (size_t)nCols; k++) {
 
  960                A(i, j, k) = B(0, i, j * nCols + k);
 
  967 template <
typename AFloat>
 
  968 void TCpu<AFloat>::Rearrange(Tensor_t &out, 
const Tensor_t &in)
 
  971    assert ( out.GetShape().size() == 3 && in.GetShape().size() == 3);
 
  974    size_t B = out.GetFirstSize();
 
  975    size_t T = out.GetCSize();  
 
  976    size_t D = out.GetWSize();  
 
  977    if ((T != in.GetFirstSize()) || (B != in.GetCSize()) || (D != in.GetWSize()) ) {
 
  978       std::cout << 
"Incompatible Dimensions\n" 
  979                 << in.GetFirstSize() << 
"x" << in.GetCSize() << 
"x" << in.GetWSize() << 
" --> " << B << 
"x" << T << 
"x" 
  984    for (
size_t i = 0; i < B; ++i) {
 
  985       for (
size_t j = 0; j < T; ++j) {
 
  986          for (
size_t k = 0; k < D; ++k) {
 
  987             out( i, j, k ) = in( j, i, k);