template <typename AFloat>
void TCpu<AFloat>::MultiplyTranspose(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &input,
                                     const TCpuMatrix<AFloat> &Weights)
{
   int m = (int)input.GetNrows();
   int k = (int)input.GetNcols();
   int n = (int)Weights.GetNrows();

   if ((int)output.GetNrows() != m) {
      Error("MultiplyTranspose", "Invalid input - output rows - input: %d != output : %d", m, (int)output.GetNrows());
      R__ASSERT((int)output.GetNrows() == m);
   }
   if ((int)output.GetNcols() != n) {
      Error("MultiplyTranspose", "Invalid output cols or weight rows - output cols: %d != weight rows : %d", (int)output.GetNcols(), n);
      R__ASSERT((int)output.GetNcols() == n);
   }
   if ((int)Weights.GetNcols() != k) {
      Error("MultiplyTranspose", "Invalid input cols or weight cols - input cols: %d != weight cols : %d", k, (int)Weights.GetNcols());
      R__ASSERT((int)Weights.GetNcols() == k);
   }

#ifdef R__HAS_TMVACPU
   // output = input * Weights^T computed with BLAS gemm
   char transa = 'N';
   char transb = 'T';
   AFloat alpha = 1.0;
   AFloat beta = 0.0;

   const AFloat *A = input.GetRawDataPointer();
   const AFloat *B = Weights.GetRawDataPointer();
   AFloat *C = output.GetRawDataPointer();

   ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, A, &m, B, &n, &beta, C, &m);
#else
   // fallback without a CPU BLAS backend
   TMatrixT<AFloat> tmp(output.GetNrows(), output.GetNcols());
   tmp.MultT(input, Weights);
   output = tmp;
#endif
}

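// AddRowWise adds the bias vector to every row of the output matrix. With a
// BLAS backend this is expressed as a rank-1 update (Ger) of output with a
// vector of ones and the bias values, i.e. output(i,j) += 1 * biases(j).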
template <typename AFloat>
void TCpu<AFloat>::AddRowWise(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &biases)
{
#ifdef R__HAS_TMVACPU
   int m = (int)output.GetNrows();
   int n = (int)output.GetNcols();

   int inc = 1;
   AFloat alpha = 1.0;

   AFloat *A = output.GetRawDataPointer();
   const AFloat *x = TCpuMatrix<AFloat>::GetOnePointer();
   const AFloat *y = biases.GetRawDataPointer();

   R__ASSERT(m <= (int)TCpuMatrix<AFloat>::GetOnePointerSize());
   R__ASSERT(n <= (int)(biases.GetNcols() * biases.GetNrows()));

   ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
#else
   // fallback using the reference implementation
   TMatrixT<AFloat> tmp = output;
   TReference<AFloat>::AddRowWise(tmp, biases);
   output = tmp;
#endif
}

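// Backward computes the backward pass of a dense (fully connected) layer:
// the gradient w.r.t. the previous activations (df * W), the weight gradients
// (df^T times the previous activations) and the bias gradients (column sums of df).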
template <typename AFloat>
void TCpu<AFloat>::Backward(TCpuTensor<AFloat> &activationGradientsBackward, TCpuMatrix<AFloat> &weightGradients,
                            TCpuMatrix<AFloat> &biasGradients, const TCpuTensor<AFloat> &df,
                            const TCpuTensor<AFloat> &, const TCpuMatrix<AFloat> &weights,
                            const TCpuTensor<AFloat> &activationsBackward)
{
   Matrix_t df_m = df.GetMatrix();

   if (activationGradientsBackward.GetSize() > 0) {
      Matrix_t activationGradientsBackward_m = activationGradientsBackward.GetMatrix();
      Multiply(activationGradientsBackward_m, df_m, weights);
   }

   if (weightGradients.GetNoElements() > 0) TransposeMultiply(weightGradients, df_m, activationsBackward.GetMatrix());

   if (biasGradients.GetNoElements() > 0) SumColumns(biasGradients, df_m);
}

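// Im2col unrolls each local receptive field of the input image B into one row
// of the output matrix A, so that the convolution can afterwards be computed as
// a single matrix multiplication with the filter matrix. Pixels falling into
// the zero-padding region are written as 0.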
template <typename AFloat>
void TCpu<AFloat>::Im2col(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, size_t imgHeight, size_t imgWidth,
                          size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                          size_t zeroPaddingHeight, size_t zeroPaddingWidth)
{
   // image boundaries, taking the zero padding into account
   int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
   int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
   size_t currLocalView = 0;

   const int halfFltHeight = fltHeight / 2;
   const int halfFltWidth = fltWidth / 2;
   const int halfFltHeightM1 = (fltHeight - 1) / 2;
   const int halfFltWidthM1 = (fltWidth - 1) / 2;
   const int nRowsInput = B.GetNrows();
   const int nColsInput = B.GetNcols();
   const int nRowsOutput = A.GetNrows();
   const int nColsOutput = A.GetNcols();

   // loop over the centers of the local views
   for (int i = halfFltHeight - zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
      for (int j = halfFltWidth - zeroPaddingWidth; j <= imgWidthBound; j += strideCols) {
         size_t currLocalViewPixel = 0;

         R__ASSERT((int)currLocalView < nRowsOutput);

         for (int m = 0; m < nRowsInput; m++) {
            for (int k = i - halfFltHeight; k <= Int_t(i + halfFltHeightM1); k++) {
               int kstep = k * imgWidth;
               for (int l = j - halfFltWidth; l <= Int_t(j + halfFltWidthM1); l++) {

                  R__ASSERT((int)currLocalViewPixel < nColsOutput);

                  // zero for pixels in the padding region, copy the input pixel otherwise
                  if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >= nColsInput)
                     A(currLocalView, currLocalViewPixel++) = 0;
                  else
                     A(currLocalView, currLocalViewPixel++) = B(m, kstep + l);
               }
            }
         }
         currLocalView++;
      }
   }
}

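// Im2colIndices precomputes, for one image geometry, the mapping from each
// (local view, pixel) entry of the im2col matrix to the linear index of the
// corresponding input element; padding entries are marked with -1. The same
// index vector can then be reused by Im2colFast for every image in the batch.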
template <typename AFloat>
void TCpu<AFloat>::Im2colIndices(std::vector<int> &V, const TCpuMatrix<AFloat> &B, size_t nLocalViews,
                                 size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                                 size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
                                 size_t zeroPaddingWidth)
{
   int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
   int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
   size_t currLocalView = 0;

   const int halfFltHeight = fltHeight / 2;
   const int halfFltWidth = fltWidth / 2;
   const int halfFltHeightM1 = (fltHeight - 1) / 2;
   const int halfFltWidthM1 = (fltWidth - 1) / 2;
   const int nRowsInput = B.GetNrows();
   const int nColsInput = B.GetNcols();
   const size_t nSizeOutput = V.size();
   const int npixels = nRowsInput * fltHeight * fltWidth;

   // loop over the centers of the local views
   for (int i = halfFltHeight - zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
      for (int j = halfFltWidth - zeroPaddingWidth; j <= imgWidthBound; j += strideCols) {
         size_t currLocalViewPixel = 0;

         for (int m = 0; m < nRowsInput; m++) {
            for (int k = i - halfFltHeight; k <= Int_t(i + halfFltHeightM1); k++) {
               int kstep = k * imgWidth;
               for (int l = j - halfFltWidth; l <= Int_t(j + halfFltWidthM1); l++) {

                  R__ASSERT(currLocalView * npixels + currLocalViewPixel < nSizeOutput);
                  // mark padding pixels with -1, otherwise store the linear index of the input element
                  if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >= nColsInput)
                     V[currLocalViewPixel * nLocalViews + currLocalView] = -1;
                  else
                     V[currLocalViewPixel * nLocalViews + currLocalView] = (kstep + l) * nRowsInput + m;

                  currLocalViewPixel++;
               }
            }
         }
         currLocalView++;
      }
   }
}

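// Im2colFast fills the im2col matrix A by gathering elements of B through the
// index vector V computed by Im2colIndices; entries with index -1 correspond
// to zero padding.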
template <typename AFloat>
void TCpu<AFloat>::Im2colFast(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, const std::vector<int> &V)
{
   size_t n = V.size();
   R__ASSERT(n == A.GetNcols() * A.GetNrows());
   AFloat *a = A.GetRawDataPointer();
   const AFloat *b = B.GetRawDataPointer();

#ifdef DL_USE_MTE
   // optional multi-threaded gather over chunks of work items
   const size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(n);

   auto f = [&](UInt_t workerID)
   {
      for (size_t j = 0; j < nsteps; ++j) {
         size_t ii = workerID + j;
         if (ii >= n) break;
         int idx = V[ii];
         if (idx >= 0) a[ii] = b[idx];
         else a[ii] = 0;
      }
   };

   A.GetThreadExecutor().Foreach(f, ROOT::TSeqI(0, n, nsteps));
#else
   // serial gather
   for (size_t ii = 0; ii < n; ++ii) {
      int idx = V[ii];
      if (idx >= 0) a[ii] = b[idx];
      else a[ii] = 0;
   }
#endif
}

template <typename AFloat>
void TCpu<AFloat>::RotateWeights(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, size_t filterDepth,
                                 size_t filterHeight, size_t filterWidth, size_t numFilters)
{
   size_t jump = filterHeight * filterWidth;
   for (size_t j = 0; j < filterDepth; j++) {
      for (size_t k = 0; k < numFilters; k++) {
         for (size_t i = 0; i < jump; i++) {
            A(j, k * jump + i) = B(k, ((j + 1) * jump - 1) - i);
         }
      }
   }
}

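// AddConvBiases adds one bias per filter to the convolution output: every
// element of output row m (one row per filter / output channel) gets biases(m)
// added, again expressed as a rank-1 update with a vector of ones.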
template <typename AFloat>
void TCpu<AFloat>::AddConvBiases(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &biases)
{
#ifdef R__HAS_TMVACPU
   int m = (int)output.GetNrows();
   int n = (int)output.GetNcols();

   int inc = 1;
   AFloat alpha = 1.0;

   AFloat *A = output.GetRawDataPointer();
   const AFloat *x = biases.GetRawDataPointer();
   const AFloat *y = TCpuMatrix<AFloat>::GetOnePointer();

   R__ASSERT(m <= (int)biases.GetNoElements());
   R__ASSERT(n <= (int)TCpuMatrix<AFloat>::GetOnePointerSize());

   ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
#else
   // fallback using the reference implementation
   TMatrixT<AFloat> tmp;
   TReference<AFloat>::AddConvBiases(tmp, biases);
   output = tmp;
#endif
}

template <typename AFloat>
size_t TCpu<AFloat>::calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
{
   size_t temp = imgDim - fltDim + 2 * padding;
   if (temp % stride || temp + stride <= 0) {
      Fatal("calculateDimension",
            "Not compatible hyper parameters for layer - (imageDim, filterDim, padding, stride) "
            "%zu, %zu, %zu, %zu", imgDim, fltDim, padding, stride);
   }

   return temp / stride + 1;
}

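// ConvLayerForward computes the forward pass of a convolutional layer for a
// whole batch: each input image is unrolled with Im2colFast, multiplied with
// the filter matrix (MultiplyTranspose), the biases are added and finally the
// activation function is applied. The spatial output size follows
// calculateDimension, e.g. imgDim = 32, fltDim = 3, padding = 1, stride = 1
// gives (32 - 3 + 2) / 1 + 1 = 32.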
template <typename AFloat>
void TCpu<AFloat>::ConvLayerForward(TCpuTensor<AFloat> &output,
                                    TCpuTensor<AFloat> &inputActivationFunc,
                                    const TCpuTensor<AFloat> &input,
                                    const TCpuMatrix<AFloat> &weights, const TCpuMatrix<AFloat> &biases,
                                    const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
                                    TCpuTensor<AFloat> &,
                                    const ConvDescriptors_t &,
                                    ConvWorkspace_t &)
{
   size_t height = calculateDimension(params.inputHeight, params.filterHeight, params.paddingHeight, params.strideRows);
   size_t width = calculateDimension(params.inputWidth, params.filterWidth, params.paddingWidth, params.strideCols);
   size_t nLocalViews = height * width;
   size_t nLocalViewPixels = params.inputDepth * params.filterHeight * params.filterWidth;

   R__ASSERT(input.GetSize() > 0);
   // compute the im2col index mapping only once for the whole batch
   std::vector<int> forwardIndices(nLocalViews * nLocalViewPixels);
   Im2colIndices(forwardIndices, input.At(0).GetMatrix(), nLocalViews, params.inputHeight, params.inputWidth, params.filterHeight,
                 params.filterWidth, params.strideRows, params.strideCols, params.paddingHeight, params.paddingWidth);

   // initialize the vectors of ones used by AddConvBiases before entering the parallel region
   TCpuMatrix<AFloat>::InitializeOneVector(nLocalViews);
   TCpuMatrix<AFloat>::InitializeOneVector(output.GetWSize());

   auto f = [&](UInt_t i)
   {
      TCpuMatrix<AFloat> inputTr(nLocalViews, nLocalViewPixels);

      Im2colFast(inputTr, input.At(i).GetMatrix(), forwardIndices);

      Matrix_t output_m = output.At(i).GetMatrix();
      MultiplyTranspose(output_m, weights, inputTr);
      AddConvBiases(output_m, biases);
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(input.GetFirstSize()));

   // keep a copy of the pre-activation output for the backward pass
   Copy(inputActivationFunc, output);

   // apply the activation function
   ActivationFunctionForward(output, activFunc, ActivationDescriptor_t());
}

template <typename AFloat>
void TCpu<AFloat>::ConvLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
                                     TCpuMatrix<AFloat> &weightGradients, TCpuMatrix<AFloat> &biasGradients,
                                     TCpuTensor<AFloat> &inputActivationFunc,
                                     TCpuTensor<AFloat> &activationGradients,
                                     const TCpuMatrix<AFloat> &weights,
                                     const TCpuTensor<AFloat> &activationsBackward,
                                     const Tensor_t &outputTensor,
                                     EActivationFunction activFunc,
                                     const ConvDescriptors_t &,
                                     ConvWorkspace_t &,
                                     size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
                                     size_t height, size_t width, size_t filterDepth, size_t filterHeight,
                                     size_t filterWidth, size_t nLocalViews)
{
   // back-propagate the activation gradients through the activation function
   Tensor_t df(activationGradients.GetShape());
   ActivationFunctionBackward(df, outputTensor, activationGradients, inputActivationFunc,
                              activFunc, ActivationDescriptor_t());

   // gradients w.r.t. the activations of the previous layer
   CalculateConvActivationGradients(activationGradientsBackward, df, weights, batchSize, inputHeight, inputWidth, depth,
                                    height, width, filterDepth, filterHeight, filterWidth);

   // weight gradients
   CalculateConvWeightGradients(weightGradients, df, activationsBackward, batchSize, inputHeight, inputWidth, depth,
                                height, width, filterDepth, filterHeight, filterWidth, nLocalViews);

   // bias gradients
   CalculateConvBiasGradients(biasGradients, df, batchSize, depth, nLocalViews);
}

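// CalculateConvActivationGradients computes the gradient with respect to the
// layer input. This corresponds to a "full" convolution of the gradient tensor
// df with the weights rotated by 180 degrees (RotateWeights), implemented here
// with stride 1 and a padding chosen such that the result recovers the input
// dimensions.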
template <typename AFloat>
void TCpu<AFloat>::CalculateConvActivationGradients(TCpuTensor<AFloat> &activationGradientsBackward,
                                                    const TCpuTensor<AFloat> &df,
                                                    const TCpuMatrix<AFloat> &weights, size_t batchSize,
                                                    size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
                                                    size_t width, size_t filterDepth, size_t filterHeight,
                                                    size_t filterWidth)
{
   if (activationGradientsBackward.GetSize() == 0) return;

   activationGradientsBackward.Zero();

   // transform the weights: one row per input channel, filters rotated by 180 degrees
   TCpuMatrix<AFloat> rotWeights(filterDepth, depth * filterHeight * filterWidth);
   RotateWeights(rotWeights, weights, filterDepth, filterHeight, filterWidth, weights.GetNrows());

   // padding for the full convolution
   size_t tempZeroPaddingHeight = (size_t)(floor((inputHeight - height + filterHeight - 1) / 2));
   size_t tempZeroPaddingWidth = (size_t)(floor((inputWidth - width + filterWidth - 1) / 2));

   // number of local views and pixels per view for the full convolution
   size_t tempNLocalViews = inputHeight * inputWidth;
   size_t tempNLocalViewPixels = depth * filterHeight * filterWidth;

   size_t tempStrideRows = 1;
   size_t tempStrideCols = 1;

   // compute the im2col index mapping once, it is the same for all batch elements
   std::vector<int> vIndices(tempNLocalViews * tempNLocalViewPixels);
   Im2colIndices(vIndices, df.At(0).GetMatrix(), tempNLocalViews, height, width, filterHeight, filterWidth,
                 tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);

   R__ASSERT(batchSize == df.GetFirstSize());
   R__ASSERT(batchSize == activationGradientsBackward.GetFirstSize());

   auto f = [&](UInt_t i)
   {
      TCpuMatrix<AFloat> dfTr(tempNLocalViews, tempNLocalViewPixels);

      Im2colFast(dfTr, df.At(i).GetMatrix(), vIndices);

      Matrix_t agb_m = activationGradientsBackward.At(i).GetMatrix();
      MultiplyTranspose(agb_m, rotWeights, dfTr);
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(batchSize));
}

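// CalculateConvWeightGradients accumulates, for every batch element, the
// product of df with the im2col-transformed previous activations, and then
// sums the per-event contributions into the weight gradient matrix.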
template <typename AFloat>
void TCpu<AFloat>::CalculateConvWeightGradients(TCpuMatrix<AFloat> &weightGradients,
                                                const TCpuTensor<AFloat> &df,
                                                const TCpuTensor<AFloat> &activationsBackward,
                                                size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
                                                size_t height, size_t width, size_t filterDepth, size_t filterHeight,
                                                size_t filterWidth, size_t nLocalViews)
{
   // reinitialize the weight gradients to zero
   weightGradients.Zero();

   const size_t filterSize = filterHeight * filterWidth;
   const size_t nLocalViewPixels = filterDepth * filterHeight * filterWidth;
   R__ASSERT(weightGradients.GetNcols() == filterDepth * filterHeight * filterWidth);

   const size_t tempStrideRows = 1;
   const size_t tempStrideCols = 1;

   // padding such that the convolution of the activations with df has the filter dimensions
   const size_t tempZeroPaddingHeight = (height - inputHeight + filterHeight - 1) / 2;
   const size_t tempZeroPaddingWidth = (width - inputWidth + filterWidth - 1) / 2;

   // compute the im2col index mapping once for the whole batch
   std::vector<int> vIndices(nLocalViews * nLocalViewPixels);
   Im2colIndices(vIndices, activationsBackward.At(0).GetMatrix(), nLocalViews, inputHeight, inputWidth, filterHeight, filterWidth,
                 tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);

   // store the partial result of each batch element and reduce at the end
   TCpuTensor<AFloat> vres(batchSize, depth, nLocalViewPixels);

   auto fmap = [&](int i) {

      TCpuMatrix<AFloat> xTr(nLocalViews, nLocalViewPixels);
      TCpuMatrix<AFloat> res(depth, nLocalViewPixels);

      Im2colFast(xTr, activationsBackward.At(i).GetMatrix(), vIndices);

      Matrix_t mres = vres.At(i).GetMatrix();
      Multiply(mres, df.At(i).GetMatrix(), xTr);
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(fmap, ROOT::TSeqI(batchSize));

   // sum the partial results over the batch
   R__ASSERT(vres.GetFirstSize() == batchSize);
   for (size_t i = 0; i < batchSize; i++) {
      Matrix_t vres_m = vres.At(i).GetMatrix();
      for (size_t j = 0; j < depth; j++) {
         for (size_t k = 0; k < filterDepth; k++) {
            size_t kOffset = k * filterSize;
            for (size_t l = 0; l < filterSize; l++) {
               weightGradients(j, kOffset + l) += vres_m(j, kOffset + l);
            }
         }
      }
   }
}

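// CalculateConvBiasGradients: the gradient of each bias is the sum of df over
// all batch elements and all spatial positions of the corresponding output
// channel.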
template <typename AFloat>
void TCpu<AFloat>::CalculateConvBiasGradients(TCpuMatrix<AFloat> &biasGradients, const TCpuTensor<AFloat> &df,
                                              size_t batchSize, size_t depth, size_t nLocalViews)
{
   biasGradients.Zero();
   for (size_t i = 0; i < depth; i++) {
      AFloat sum = 0;
      for (size_t j = 0; j < nLocalViews; j++) {
         for (size_t k = 0; k < batchSize; k++) {
            sum += df(k, i, j);
         }
      }
      biasGradients(i, 0) = sum;
   }
}

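// Downsample performs max pooling: for every pooling window the maximum value
// of C is written to A and the (flattened) index of the winning pixel is
// recorded in B, to be used later by MaxPoolLayerBackward.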
template <typename AFloat>
void TCpu<AFloat>::Downsample(TCpuTensor<AFloat> &tA, TCpuTensor<AFloat> &tB, const TCpuTensor<AFloat> &tC,
                              const PoolingDescriptors_t &,
                              PoolingWorkspace_t &,
                              size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                              size_t strideRows, size_t strideCols)
{
   assert(tA.GetFirstSize() == tC.GetFirstSize());
   for (size_t ifirst = 0; ifirst < tC.GetFirstSize(); ++ifirst) {

      Matrix_t A = tA.At(ifirst).GetMatrix();
      Matrix_t B = tB.At(ifirst).GetMatrix();
      Matrix_t C = tC.At(ifirst).GetMatrix();

      // image boundaries
      int imgHeightBound = imgHeight - (fltHeight - 1) / 2 - 1;
      int imgWidthBound = imgWidth - (fltWidth - 1) / 2 - 1;
      size_t currLocalView = 0;

      // loop over the centers of the pooling windows
      for (int i = fltHeight / 2; i <= imgHeightBound; i += strideRows) {
         for (int j = fltWidth / 2; j <= imgWidthBound; j += strideCols) {
            // within the local view
            for (int m = 0; m < (Int_t)C.GetNrows(); m++) {
               AFloat value = -std::numeric_limits<AFloat>::max();

               for (int k = i - fltHeight / 2; k <= Int_t(i + (fltHeight - 1) / 2); k++) {
                  for (int l = j - fltWidth / 2; l <= Int_t(j + (fltWidth - 1) / 2); l++) {
                     if (C(m, k * imgWidth + l) > value) {
                        value = C(m, k * imgWidth + l);
                        B(m, currLocalView) = k * imgWidth + l;
                     }
                  }
               }
               A(m, currLocalView) = value;
            }
            currLocalView++;
         }
      }
   }
}

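// MaxPoolLayerBackward propagates the gradients through a max-pooling layer:
// the gradient of each pooled output is routed back only to the input pixel
// that produced the maximum, whose index was stored in indexMatrix during the
// forward pass (Downsample).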
template <typename AFloat>
void TCpu<AFloat>::MaxPoolLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
                                        const TCpuTensor<AFloat> &activationGradients,
                                        const TCpuTensor<AFloat> &indexMatrix,
                                        const TCpuTensor<AFloat> &,
                                        const TCpuTensor<AFloat> &,
                                        const PoolingDescriptors_t &,
                                        PoolingWorkspace_t &,
                                        size_t /*imgHeight*/,
                                        size_t /*imgWidth*/,
                                        size_t /*fltHeight*/,
                                        size_t /*fltWidth*/,
                                        size_t /*strideRows*/,
                                        size_t /*strideCols*/,
                                        size_t nLocalViews)
{
   assert(activationGradientsBackward.GetFirstSize() == activationGradients.GetFirstSize());
   for (size_t l = 0; l < activationGradients.GetFirstSize(); ++l) {

      Matrix_t activationGradientsBackward_m = activationGradientsBackward.At(l).GetMatrix();
      Matrix_t activationGradients_m = activationGradients.At(l).GetMatrix();
      Matrix_t indexMatrix_m = indexMatrix.At(l).GetMatrix();

      size_t depth = activationGradientsBackward_m.GetNrows();

      for (size_t j = 0; j < depth; j++) {
         // reset the gradients of the previous layer to zero
         for (size_t t = 0; t < (size_t)activationGradientsBackward_m.GetNcols(); t++) {
            activationGradientsBackward_m(j, t) = 0;
         }

         // route each gradient back to the winning pixel of its pooling window
         for (size_t k = 0; k < nLocalViews; k++) {
            AFloat grad = activationGradients_m(j, k);
            size_t winningIdx = indexMatrix_m(j, k);
            activationGradientsBackward_m(j, winningIdx) += grad;
         }
      }
   }
}

template <typename AFloat>
TCpuTensor<AFloat> TCpu<AFloat>::BatchNormLayerReshapeTensor(int axis, const TCpuTensor<AFloat> &x)
{
   // reshape the tensor according to the normalization axis: for normalization
   // along the channel axis (axis == 1) put the normalized dimension last,
   // otherwise normalize along the first (batch) dimension
   if (axis == 1) {
      typename TCpuTensor<AFloat>::Shape_t newShape = {x.GetSize() / x.GetShape().front(), x.GetShape().front()};
      TCpuTensor<AFloat> xtmp(x.GetDeviceBuffer(), newShape, TCpuTensor<AFloat>::MemoryLayout::RowMajor);
      return xtmp;
   }
   return x.Reshape({x.GetShape().front(), x.GetSize() / x.GetShape().front()});
}

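// BatchNormLayerForwardTraining normalizes each component k over the batch:
//   mean_k = (1/n) * sum_i x_i,   var_k = (1/n) * sum_i (x_i - mean_k)^2
//   y_i = gamma_k * (x_i - mean_k) / sqrt(var_k + epsilon) + beta_k
// and updates the running mean and variance with a momentum-weighted average
// for later use at inference time.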
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerForwardTraining(int axis, const TCpuTensor<AFloat> &x,
                                                 TCpuTensor<AFloat> &y,
                                                 Matrix_t &gamma, Matrix_t &beta,
                                                 Matrix_t &mean,
                                                 Matrix_t &variance,
                                                 Matrix_t &iVariance,
                                                 Matrix_t &runningMeans,
                                                 Matrix_t &runningVars,
                                                 Scalar_t nTrainedBatches,
                                                 Scalar_t momentum, Scalar_t epsilon,
                                                 const TensorDescriptor_t &)
{
   TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis, x);
   TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis, y);

   assert(input.GetShape().size() == 2);
   size_t n = input.GetShape()[0]; // size of the dimension that is normalized (e.g. the batch size)
   size_t d = input.GetShape()[1]; // number of components normalized independently

   TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
   TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();

   // normalization of component k over the n values of the batch
   auto f = [&](size_t k)
   {
      auto inputK = inputBuffer.GetSubBuffer(k * n, n);
      auto outputK = outputBuffer.GetSubBuffer(k * n, n);

      // batch mean
      double meanK = 0;
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         meanK += xi;
      }
      meanK = meanK / n;

      // batch variance; store x - mean in the output
      double sq = 0;
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         double xmu = xi - meanK;
         sq = sq + (xmu * xmu);
         outputK[i] = AFloat(xmu);
      }
      mean(0, k) = meanK;
      variance(0, k) = sq / n;
      iVariance(0, k) = 1. / std::sqrt(variance(0, k) + epsilon);

      // scale and shift
      double iVK = iVariance(0, k);
      double gK = gamma(0, k);
      double bK = beta(0, k);
      for (size_t i = 0; i < n; i++) {
         AFloat yi = outputK[i];
         outputK[i] = AFloat(gK * iVK * yi + bK);
      }

      // update the running mean and variance used at inference time
      if (nTrainedBatches == 0) {
         runningMeans(0, k) = mean(0, k);
         runningVars(0, k) = variance(0, k) * (n) / (Scalar_t(n - 1) + epsilon);
      } else {
         double decay = momentum;
         if (momentum < 0) decay = nTrainedBatches / Scalar_t(nTrainedBatches + 1);
         runningMeans(0, k) = decay * runningMeans(0, k) + (1. - decay) * mean(0, k);
         runningVars(0, k) = decay * runningVars(0, k) + (1. - decay) * variance(0, k) * (n) / (Scalar_t(n - 1) + epsilon);
      }
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d));
}

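// BatchNormLayerForwardInference applies the same affine normalization as in
// training, but uses the running means and variances accumulated during
// training instead of the statistics of the current batch.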
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerForwardInference(int axis, const TCpuTensor<AFloat> &x,
                                                  Matrix_t &gamma, Matrix_t &beta,
                                                  TCpuTensor<AFloat> &y,
                                                  const Matrix_t &runningMeans,
                                                  const Matrix_t &runningVars,
                                                  Scalar_t epsilon,
                                                  const TensorDescriptor_t &)
{
   TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis, x);
   TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis, y);

   assert(input.GetShape().size() == 2);
   size_t n = input.GetShape()[0];
   size_t d = input.GetShape()[1];

   TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
   TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();

   auto f = [&](size_t k) {

      auto inputK = inputBuffer.GetSubBuffer(k * n, n);
      auto outputK = outputBuffer.GetSubBuffer(k * n, n);

      double gK = gamma(0, k);
      double bK = beta(0, k);
      double mK = runningMeans(0, k);
      double vK = 1. / (sqrt(runningVars(0, k) + epsilon));

      // normalize using the running mean and variance
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         outputK[i] = AFloat(gK * (xi - mK) * vK + bK);
      }
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d));
}

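// BatchNormLayerBackward computes the usual batch normalization gradients,
// as can be read off the loops below:
//   dbeta_k  = sum_i dy_i
//   dgamma_k = sum_i dy_i * (x_i - mean_k) / sqrt(var_k + epsilon)
// and for the input gradient
//   dx_i = gamma_k / (n * sqrt(var_k + epsilon)) * ( n * dy_i - sum_j dy_j
//          - (x_i - mean_k) * sum_j dy_j * (x_j - mean_k) / (var_k + epsilon) )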
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerBackward(int axis, const TCpuTensor<AFloat> &x,
                                          const TCpuTensor<AFloat> &dy,
                                          TCpuTensor<AFloat> &dx,
                                          Matrix_t &gamma,
                                          Matrix_t &dgamma, Matrix_t &dbeta,
                                          const Matrix_t &mean,
                                          const Matrix_t &variance,
                                          const Matrix_t &iVariance,
                                          Scalar_t epsilon,
                                          const TensorDescriptor_t &)
{
   TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis, x);
   TCpuTensor<AFloat> inputGrad = BatchNormLayerReshapeTensor(axis, dx);
   TCpuTensor<AFloat> outputGrad = BatchNormLayerReshapeTensor(axis, dy);

   assert(outputGrad.GetShape().size() == 2);
   size_t n = outputGrad.GetShape()[0];
   size_t d = outputGrad.GetShape()[1];

   TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
   TCpuBuffer<AFloat> &dyBuffer = outputGrad.GetDeviceBuffer();
   TCpuBuffer<AFloat> &dxBuffer = inputGrad.GetDeviceBuffer();

   // gradients with respect to the parameters and to the input, per component k
   auto f = [&](size_t k) {

      auto inputK = inputBuffer.GetSubBuffer(k * n, n);
      auto outputGradK = dyBuffer.GetSubBuffer(k * n, n);
      auto inputGradK = dxBuffer.GetSubBuffer(k * n, n);
      auto meanK = mean(0, k);

      dbeta(0, k) = 0;
      dgamma(0, k) = 0;
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         double xhat = xi - meanK;
         dbeta(0, k) += outputGradK[i];
         dgamma(0, k) += outputGradK[i] * xhat;
      }
      double npSumDy = dbeta(0, k);
      double npSumDyHMu = dgamma(0, k);
      dgamma(0, k) *= iVariance(0, k);

      // gradient with respect to the input
      double bterm = npSumDyHMu / (variance(0, k) + epsilon);
      double aterm = (1. / double(n) * gamma(0, k) * iVariance(0, k));
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         AFloat dyi = outputGradK[i];
         double xmu = xi - meanK;
         inputGradK[i] = AFloat(aterm * (n * dyi - npSumDy - xmu * bterm));
      }
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d));
}

template <typename AFloat>
void TCpu<AFloat>::Reshape(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B)
{
   size_t nColsA = A.GetNcols();
   size_t nColsB = B.GetNcols();

   for (size_t i = 0; i < A.GetNrows(); i++) {
      for (size_t j = 0; j < A.GetNcols(); j++) {
         size_t nElem = i * nColsA + j;
         A(i, j) = B(nElem / nColsB, nElem % nColsB);
      }
   }
}

template <typename AFloat>
void TCpu<AFloat>::Flatten(TCpuTensor<AFloat> &A, const TCpuTensor<AFloat> &B)
{
   assert(B.GetShape().size() == 3);
   assert(A.GetShape().size() == 3);

   size_t bsize = B.GetFirstSize();
   size_t nRows = B.GetHSize();
   size_t nCols = B.GetWSize();

   assert(A.GetFirstSize() == 1);
   assert(A.GetHSize() == bsize);
   assert(A.GetWSize() == nRows * nCols);

   for (size_t i = 0; i < bsize; i++) {
      for (size_t j = 0; j < nRows; j++) {
         for (size_t k = 0; k < nCols; k++) {
            A(0, i, j * nCols + k) = B(i, j, k);
         }
      }
   }
}

template <typename AFloat>
void TCpu<AFloat>::Deflatten(TCpuTensor<AFloat> &A, const TCpuTensor<AFloat> &B)
{
   assert(B.GetShape().size() == 3);
   assert(A.GetShape().size() == 3);

   size_t size  = A.GetFirstSize();
   size_t nRows = A.GetHSize();
   size_t nCols = A.GetWSize();

   assert(B.GetFirstSize() == 1);
   assert(B.GetHSize() == size);
   assert(B.GetWSize() == nRows * nCols);

   for (size_t i = 0; i < size; i++) {
      for (size_t j = 0; j < nRows; j++) {
         for (size_t k = 0; k < nCols; k++) {
            A(i, j, k) = B(0, i, j * nCols + k);
         }
      }
   }
}

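// Rearrange transposes the first two tensor axes, e.g. converting a
// (time, batch, features) tensor into (batch, time, features) layout,
// as needed when passing data between recurrent and dense layers.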
template <typename AFloat>
void TCpu<AFloat>::Rearrange(Tensor_t &out, const Tensor_t &in)
{
   assert(out.GetShape().size() == 3 && in.GetShape().size() == 3);

   size_t B = out.GetFirstSize();
   size_t T = out.GetCSize();
   size_t D = out.GetWSize();
   if ((T != in.GetFirstSize()) || (B != in.GetCSize()) || (D != in.GetWSize())) {
      std::cout << "Incompatible Dimensions\n"
                << in.GetFirstSize() << "x" << in.GetCSize() << "x" << in.GetWSize()
                << " --> " << B << "x" << T << "x" << D << "\n";
      return;
   }
   for (size_t i = 0; i < B; ++i) {
      for (size_t j = 0; j < T; ++j) {
         for (size_t k = 0; k < D; ++k) {
            out(i, j, k) = in(j, i, k);
         }
      }
   }
}