Propagation.hxx
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 10/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////
// Implementation of the functions required for the forward and    //
// backward propagation of activations through a neural network    //
// for the reference implementation.                                //
//////////////////////////////////////////////////////////////////////

#include "TMVA/DNN/Architectures/Cpu.h"

#ifdef R__HAS_TMVACPU
#include "TMVA/DNN/Architectures/Cpu/Blas.h"
#else
#include "TMVA/DNN/Architectures/Reference.h"
#endif

namespace TMVA {
namespace DNN {


template <typename AFloat>
void TCpu<AFloat>::MultiplyTranspose(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &input,
                                     const TCpuMatrix<AFloat> &Weights)
{
   int m = (int)input.GetNrows();
   int k = (int)input.GetNcols();
   int n = (int)Weights.GetNrows();

   if ((int)output.GetNrows() != m) {
      Error("MultiplyTranspose", "Invalid input - output rows - input: %d != output : %d", m, (int)output.GetNrows());
      R__ASSERT((int)output.GetNrows() == m);
   }
   if ((int)output.GetNcols() != n) {
      Error("MultiplyTranspose", "Invalid output cols or weight rows - output cols: %d != weight rows : %d", (int)output.GetNcols(), n);
      R__ASSERT((int)output.GetNcols() == n);
   }
   if ((int)Weights.GetNcols() != k) {
      Error("MultiplyTranspose", "Invalid input cols or weight cols - input cols: %d != weight cols : %d", k, (int)Weights.GetNcols());
      R__ASSERT((int)Weights.GetNcols() == k);
   }

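   // The BLAS call below computes output = input * Weights^T as a GEMM:
   // C(m x n) = A(m x k) * op(B), with op(B) = Weights^T of size k x n.
   // All matrices are stored column-major, with leading dimensions m, n and m.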
#ifdef R__HAS_TMVACPU

   char transa = 'N';
   char transb = 'T';

   AFloat alpha = 1.0;
   AFloat beta = 0.0;

   const AFloat *A = input.GetRawDataPointer();
   const AFloat *B = Weights.GetRawDataPointer();
   AFloat *C = output.GetRawDataPointer();

   ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, A, &m, B, &n, &beta, C, &m);
#else
   TMatrixT<AFloat> tmp(output.GetNrows(), output.GetNcols());
   tmp.MultT(input, Weights);
   output = tmp;
#endif
}

template <typename AFloat>
void TCpu<AFloat>::AddRowWise(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &biases)
{
#ifdef R__HAS_TMVACPU
   int m = (int)output.GetNrows();
   int n = (int)output.GetNcols();

   int inc = 1;
   AFloat alpha = 1.0;

   AFloat *A = output.GetRawDataPointer();
   const AFloat *x = TCpuMatrix<AFloat>::GetOnePointer();
   const AFloat *y = biases.GetRawDataPointer();

   R__ASSERT(m <= (int)TCpuMatrix<AFloat>::GetOnePointerSize());
   R__ASSERT(n <= (int)(biases.GetNcols() * biases.GetNrows()));

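   // Ger performs the BLAS rank-1 update A += alpha * x * y^T; with x a
   // vector of ones of length m, this adds the bias value y[j] to every
   // entry of column j of the output matrix.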
   ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
#else
   TMatrixT<AFloat> tmp(output); // start from the current output values so the biases are added to them
   TReference<AFloat>::AddRowWise(tmp, biases);
   output = tmp;
#endif
}

template <typename AFloat>
void TCpu<AFloat>::Backward(TCpuTensor<AFloat> &activationGradientsBackward, TCpuMatrix<AFloat> &weightGradients,
                            TCpuMatrix<AFloat> &biasGradients, const TCpuTensor<AFloat> &df,
                            const TCpuTensor<AFloat> & /*activationGradients*/, const TCpuMatrix<AFloat> &weights,
                            const TCpuTensor<AFloat> &activationsBackward)
{
   // Compute element-wise product.
   // Hadamard(df, activationGradients);

   Matrix_t df_m = df.GetMatrix();

   // Activation gradients (exclude if it is the first layer)
   if (activationGradientsBackward.GetSize() > 0) {

      Matrix_t activationGradientsBackward_m = activationGradientsBackward.GetMatrix();

      Multiply(activationGradientsBackward_m, df_m, weights);
   }

   // Weight gradients.
   if (weightGradients.GetNoElements() > 0) TransposeMultiply(weightGradients, df_m, activationsBackward.GetMatrix());

   // PrintTensor(activationsBackward,"activ backward");
   // PrintTensor(Tensor_t(weightGradients),"weight gradients");

   // Bias gradients.
   if (biasGradients.GetNoElements() > 0) SumColumns(biasGradients, df_m);
}


//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Im2col(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, size_t imgHeight, size_t imgWidth,
                          size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                          size_t zeroPaddingHeight, size_t zeroPaddingWidth)
{
   // image boundaries
   int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
   int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
   size_t currLocalView = 0;

   const int halfFltHeight = fltHeight / 2;
   const int halfFltWidth = fltWidth / 2;
   const int halfFltHeightM1 = (fltHeight - 1) / 2;
   const int halfFltWidthM1 = (fltWidth - 1) / 2;
   const int nRowsInput = B.GetNrows();
   const int nColsInput = B.GetNcols();
   const int nRowsOutput = A.GetNrows();
   const int nColsOutput = A.GetNcols();

   // convolution centers
   for (int i = halfFltHeight - zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
      for (int j = halfFltWidth - zeroPaddingWidth; j <= imgWidthBound; j += strideCols) {
         size_t currLocalViewPixel = 0;

         // within the local view
         R__ASSERT((int)currLocalView < nRowsOutput);

         for (int m = 0; m < nRowsInput; m++) {
            for (int k = i - halfFltHeight; k <= Int_t(i + halfFltHeightM1); k++) {
               int kstep = k * imgWidth;
               for (int l = j - halfFltWidth; l <= Int_t(j + halfFltWidthM1); l++) {

                  // Check the boundaries
                  R__ASSERT((int)currLocalViewPixel < nColsOutput);
                  // R__ASSERT(k * imgWidth + l < B.GetNcols());
                  if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >= nColsInput)
                     A(currLocalView, currLocalViewPixel++) = 0;
                  else
                     A(currLocalView, currLocalViewPixel++) = B(m, kstep + l);
               }
            }
         }
         // std::cout << " i " << i << " " << j << " increment currLocalView " << currLocalView << std::endl;
         currLocalView++;
      }
   }
   // TMVA_DNN_PrintTCpuMatrix(A,"FromIm2Col");
}
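// Illustrative example of the im2col layout produced above: for a
// single-channel 3x3 image with a 2x2 filter, stride 1 and no padding,
// A has 4 rows (one per local view) and 4 columns (one per filter pixel);
// the first row holds the image pixels at (0,0), (0,1), (1,0), (1,1).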

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Im2colIndices(std::vector<int> &V, const TCpuMatrix<AFloat> &B, size_t nLocalViews, size_t imgHeight, size_t imgWidth,
                                 size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                                 size_t zeroPaddingHeight, size_t zeroPaddingWidth)
{
   // image boundaries
   int imgHeightBound = imgHeight + zeroPaddingHeight - (fltHeight - 1) / 2 - 1;
   int imgWidthBound = imgWidth + zeroPaddingWidth - (fltWidth - 1) / 2 - 1;
   size_t currLocalView = 0;

   const int halfFltHeight = fltHeight / 2;
   const int halfFltWidth = fltWidth / 2;
   const int halfFltHeightM1 = (fltHeight - 1) / 2;
   const int halfFltWidthM1 = (fltWidth - 1) / 2;
   const int nRowsInput = B.GetNrows();
   const int nColsInput = B.GetNcols();
   const size_t nSizeOutput = V.size();
   const int npixels = nRowsInput * fltHeight * fltWidth;
   // const int nRowsOutput = A.GetNrows();
   // const int nColsOutput = A.GetNcols();

   // convolution centers
   for (int i = halfFltHeight - zeroPaddingHeight; i <= imgHeightBound; i += strideRows) {
      for (int j = halfFltWidth - zeroPaddingWidth; j <= imgWidthBound; j += strideCols) {
         size_t currLocalViewPixel = 0;

         // within the local view
         // R__ASSERT((int) currLocalView < nRowsOutput );

         for (int m = 0; m < nRowsInput; m++) {
            for (int k = i - halfFltHeight; k <= Int_t(i + halfFltHeightM1); k++) {
               int kstep = k * imgWidth;
               for (int l = j - halfFltWidth; l <= Int_t(j + halfFltWidthM1); l++) {

                  // Check the boundaries
                  // R__ASSERT(currLocalViewPixel < nColsOutput );
                  R__ASSERT(currLocalView * npixels + currLocalViewPixel < nSizeOutput);
                  if (k < 0 || k >= (Int_t)imgHeight || l < 0 || l >= (Int_t)imgWidth || kstep + l >= nColsInput)
                     // V[currLocalView * npixels + currLocalViewPixel] = -1;
                     V[currLocalViewPixel * nLocalViews + currLocalView] = -1;
                  else
                     V[currLocalViewPixel * nLocalViews + currLocalView] = (kstep + l) * nRowsInput + m;

                  currLocalViewPixel++;
               }
            }
         }
         currLocalView++;
      }
   }
}

template <typename AFloat>
void TCpu<AFloat>::Im2colFast(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, const std::vector<int> &V)
{
   size_t n = V.size();
   R__ASSERT(n == A.GetNcols() * A.GetNrows());
   AFloat *a = A.GetRawDataPointer();
   const AFloat *b = B.GetRawDataPointer();

//#define DL_USE_MTE
   // parallel execution
#ifdef DL_USE_MTE
   const size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(n);

   auto f = [&](UInt_t workerID) {
      for (size_t j = 0; j < nsteps; ++j) {
         size_t ii = workerID + j;
         if (ii >= n) break;
         int idx = V[ii];
         if (idx >= 0)
            a[ii] = b[idx];
         else
            a[ii] = 0;
      }
      return 0;
   };

   A.GetThreadExecutor().Foreach(f, ROOT::TSeqI(0, n, nsteps));

#else
   // serial execution
   for (size_t ii = 0; ii < n; ++ii) {
      int idx = V[ii];
      if (idx >= 0)
         a[ii] = b[idx];
      else
         a[ii] = 0;
   }

#endif
}
//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::RotateWeights(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B, size_t filterDepth,
                                 size_t filterHeight, size_t filterWidth, size_t numFilters)
{
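   // Each filter is copied with its pixels in reverse order (a 180-degree
   // rotation) while the filter index and the depth index swap roles, so
   // that the backward pass can compute the input gradients as an entire
   // convolution with the rotated kernels.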
   size_t jump = filterHeight * filterWidth;
   for (size_t j = 0; j < filterDepth; j++) {
      for (size_t k = 0; k < numFilters; k++) {
         for (size_t i = 0; i < jump; i++) {
            A(j, k * jump + i) = B(k, ((j + 1) * jump - 1) - i);
            // A(j, k * jump + i) = B(k, j * jump + i);
         }
      }
   }
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::AddConvBiases(TCpuMatrix<AFloat> &output, const TCpuMatrix<AFloat> &biases)
{
#ifdef R__HAS_TMVACPU
   int m = (int)output.GetNrows();
   int n = (int)output.GetNcols();

   int inc = 1;
   AFloat alpha = 1.0;

   AFloat *A = output.GetRawDataPointer();
   const AFloat *x = biases.GetRawDataPointer();
   const AFloat *y = TCpuMatrix<AFloat>::GetOnePointer();

   R__ASSERT(m <= (int)biases.GetNoElements());
   R__ASSERT(n <= (int)TCpuMatrix<AFloat>::GetOnePointerSize());

   ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
#else
   TMatrixT<AFloat> tmp(output); // start from the current output values so the biases are added to them
   TReference<AFloat>::AddConvBiases(tmp, biases);
   output = tmp;
#endif
}

template <typename AFloat>
size_t TCpu<AFloat>::calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
{
   size_t temp = imgDim - fltDim + 2 * padding;
   if (temp % stride || temp + stride <= 0) {
      Fatal("calculateDimension", "Incompatible hyperparameters for layer - (imageDim, filterDim, padding, stride) "
            "%zu, %zu, %zu, %zu", imgDim, fltDim, padding, stride);
   }
   return temp / stride + 1;
}
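// Worked example: imgDim = 32, fltDim = 5, padding = 2, stride = 1 gives
// temp = 32 - 5 + 2*2 = 31 and an output dimension of 31 / 1 + 1 = 32
// ("same" padding); with stride = 2, 31 % 2 != 0 and Fatal is called.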

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::ConvLayerForward(TCpuTensor<AFloat> &output,
                                    TCpuTensor<AFloat> &inputActivationFunc,
                                    const TCpuTensor<AFloat> &input,
                                    const TCpuMatrix<AFloat> &weights, const TCpuMatrix<AFloat> &biases,
                                    const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
                                    TCpuTensor<AFloat> & /* */,
                                    const ConvDescriptors_t & /*descriptors*/,
                                    ConvWorkspace_t & /*workspace*/)
{
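   // The forward pass is implemented as im2col + GEMM: each batch sample is
   // unrolled into a matrix of local views, which is then multiplied by the
   // weight matrix, turning the convolution into one matrix product per sample.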
   size_t height = calculateDimension(params.inputHeight, params.filterHeight, params.paddingHeight, params.strideRows);
   size_t width = calculateDimension(params.inputWidth, params.filterWidth, params.paddingWidth, params.strideCols);
   size_t nLocalViews = height * width;
   size_t nLocalViewPixels = params.inputDepth * params.filterHeight * params.filterWidth;

   R__ASSERT(input.GetSize() > 0);
   std::vector<int> forwardIndices(nLocalViews * nLocalViewPixels);
   Im2colIndices(forwardIndices, input.At(0).GetMatrix(), nLocalViews, params.inputHeight, params.inputWidth, params.filterHeight,
                 params.filterWidth, params.strideRows, params.strideCols, params.paddingHeight, params.paddingWidth);

   // this should fix multi-threaded initialization of the arrays
   TCpuMatrix<AFloat>::InitializeOneVector(nLocalViews);
   TCpuMatrix<AFloat>::InitializeOneVector(output.GetWSize()); // since it is used in AddConvBiases


   auto f = [&](UInt_t i) {
      // dropout not yet implemented for CNN
      // if (applyDropout && (dropoutProbability != 1.0)) {
      //    Dropout(input[i], dropoutProbability);
      // }

      TCpuMatrix<AFloat> inputTr(nLocalViews, nLocalViewPixels);
      // inputTr.Zero(); // this is not thread safe

      Im2colFast(inputTr, input.At(i).GetMatrix(), forwardIndices);

      Matrix_t output_m = output.At(i).GetMatrix();
      MultiplyTranspose(output_m, weights, inputTr);
      AddConvBiases(output_m, biases);
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(input.GetFirstSize()));

   // evaluateDerivative<TCpu<AFloat>>(derivatives, activFunc, output);
   // need to save the output of the convolution (input to the activation function)
   Copy(inputActivationFunc, output);

   // evaluate<TCpu<AFloat>>(output, activFunc);
   ActivationFunctionForward(output, activFunc, ActivationDescriptor_t());
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::ConvLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
                                     TCpuMatrix<AFloat> &weightGradients, TCpuMatrix<AFloat> &biasGradients,
                                     TCpuTensor<AFloat> &inputActivationFunc,
                                     TCpuTensor<AFloat> &activationGradients,
                                     const TCpuMatrix<AFloat> &weights,
                                     const TCpuTensor<AFloat> &activationsBackward,
                                     const Tensor_t &outputTensor,
                                     EActivationFunction activFunc,
                                     const ConvDescriptors_t & /*descriptors*/,
                                     ConvWorkspace_t & /*workspace*/,
                                     size_t batchSize, size_t inputHeight,
                                     size_t inputWidth, size_t depth,
                                     size_t height, size_t width,
                                     size_t filterDepth, size_t filterHeight,
                                     size_t filterWidth, size_t nLocalViews)
{
   // Update derivatives
   // size_t m, n;
   // m = activationGradients[0].GetNrows();
   // n = activationGradients[0].GetNcols();


   // Compute the activation backward pass dx = f'(x) * dy;
   // put the resulting dx of the activation in activationGradients
   Tensor_t df(activationGradients.GetShape()); // this is a deep copy, could be put as a data member of the class
   ActivationFunctionBackward(df, outputTensor, activationGradients, inputActivationFunc,
                              activFunc, ActivationDescriptor_t());

   // Hadamard(df, activationGradients);


   // Calculate the activation gradients of the previous layer
   CalculateConvActivationGradients(activationGradientsBackward, df, weights, batchSize, inputHeight, inputWidth, depth,
                                    height, width, filterDepth, filterHeight, filterWidth);

   // Calculate the weight gradients
   CalculateConvWeightGradients(weightGradients, df, activationsBackward, batchSize, inputHeight, inputWidth, depth,
                                height, width, filterDepth, filterHeight, filterWidth, nLocalViews);

   // Calculate the bias gradients
   CalculateConvBiasGradients(biasGradients, df, batchSize, depth, nLocalViews);
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::CalculateConvActivationGradients(TCpuTensor<AFloat> &activationGradientsBackward,
                                                    const TCpuTensor<AFloat> &df,
                                                    const TCpuMatrix<AFloat> &weights, size_t batchSize,
                                                    size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
                                                    size_t width, size_t filterDepth, size_t filterHeight,
                                                    size_t filterWidth)
{
   if (activationGradientsBackward.GetSize() == 0) return;


   activationGradientsBackward.Zero();


   // Transform the weights

   // TMVA_DNN_PrintTCpuMatrix(weights,"weights");
   // the filter depth must be the same as the input depth
   TCpuMatrix<AFloat> rotWeights(filterDepth, depth * filterHeight * filterWidth);
   RotateWeights(rotWeights, weights, filterDepth, filterHeight, filterWidth, weights.GetNrows());
   // TMVA_DNN_PrintTCpuMatrix(rotWeights,"rot-weights");

   // Calculate the zero paddings
   size_t tempZeroPaddingHeight = (size_t)(floor((inputHeight - height + filterHeight - 1) / 2));
   size_t tempZeroPaddingWidth = (size_t)(floor((inputWidth - width + filterWidth - 1) / 2));

   // size_t tempZeroPaddingHeight = 1;
   // size_t tempZeroPaddingWidth = 1;

   // Calculate the number of local views and the number of pixels in each view
   size_t tempNLocalViews = inputHeight * inputWidth;
   size_t tempNLocalViewPixels = depth * filterHeight * filterWidth;

   size_t tempStrideRows = 1;
   size_t tempStrideCols = 1;

   // An entire convolution follows

   std::vector<int> vIndices(tempNLocalViews * tempNLocalViewPixels);
   Im2colIndices(vIndices, df.At(0).GetMatrix(), tempNLocalViews, height, width, filterHeight, filterWidth, tempStrideRows, tempStrideCols,
                 tempZeroPaddingHeight, tempZeroPaddingWidth);


   // for (size_t i = 0; i < batchSize; i++) {
   R__ASSERT(batchSize == df.GetFirstSize());
   R__ASSERT(batchSize == activationGradientsBackward.GetFirstSize());
   auto f = [&](UInt_t i) {
      // Im2col(dfTr, df[i], height, width, filterHeight, filterWidth, tempStrideRows, tempStrideCols,
      //        tempZeroPaddingHeight, tempZeroPaddingWidth);

      TCpuMatrix<AFloat> dfTr(tempNLocalViews, tempNLocalViewPixels);

      Im2colFast(dfTr, df.At(i).GetMatrix(), vIndices);

      // TMVA_DNN_PrintTCpuMatrix(df[i],"df[i]");
      // TMVA_DNN_PrintTCpuMatrix(dfTr,"dfTr");

      Matrix_t agb_m = activationGradientsBackward.At(i).GetMatrix();
      MultiplyTranspose(agb_m, rotWeights, dfTr);

      // TMVA_DNN_PrintTCpuMatrix(activationGradientsBackward[i],"activGrad-result");
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(batchSize));
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::CalculateConvWeightGradients(TCpuMatrix<AFloat> &weightGradients,
                                                const TCpuTensor<AFloat> &df,
                                                const TCpuTensor<AFloat> &activationsBackward,
                                                size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
                                                size_t height, size_t width, size_t filterDepth, size_t filterHeight,
                                                size_t filterWidth, size_t nLocalViews)
{
   // reinitialize the weight gradients to 0
   weightGradients.Zero();

   const size_t filterSize = filterHeight * filterWidth;
   const size_t nLocalViewPixels = filterDepth * filterHeight * filterWidth;
   R__ASSERT(weightGradients.GetNcols() == filterDepth * filterHeight * filterWidth);

   const size_t tempStrideRows = 1;
   const size_t tempStrideCols = 1;

   // Calculate the zero paddings from the input height and width (assume stride = 1)
   const size_t tempZeroPaddingHeight = (height - inputHeight + filterHeight - 1) / 2;
   const size_t tempZeroPaddingWidth = (width - inputWidth + filterWidth - 1) / 2;


   // convolution


   std::vector<int> vIndices(nLocalViews * nLocalViewPixels);
   Im2colIndices(vIndices, activationsBackward.At(0).GetMatrix(), nLocalViews, inputHeight, inputWidth, filterHeight, filterWidth,
                 tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);

   // std::cout << "do back-propagation in conv layer - compute weight gradient" << std::endl;

   // std::vector<TCpuMatrix<AFloat>> vres; //(batchSize);
   // for (size_t i = 0; i < batchSize; i++) {
   //    vres.emplace_back(depth, nLocalViewPixels);
   //    //TMVA_DNN_PrintTCpuMatrix(df[i],"df");
   //    //TMVA_DNN_PrintTCpuMatrix(activationsBackward[i],"df");
   // }
   // TCpuTensor<AFloat> vres( { batchSize, depth, nLocalViewPixels} );
   TCpuTensor<AFloat> vres(batchSize, depth, nLocalViewPixels);

   auto fmap = [&](int i) {

      // TMVA_DNN_PrintTCpuMatrix(df[i],"df-i");
      TCpuMatrix<AFloat> xTr(nLocalViews, nLocalViewPixels);
      TCpuMatrix<AFloat> res(depth, nLocalViewPixels);

      // computing the gradient is equivalent to doing a convolution of the input using the deltas (the df[] values) as the conv kernel
      // N.B. only stride values = 1 are supported for now

      // xTr.Zero();
      // Im2col(xTr, const_cast<TCpuMatrix<AFloat> &>(activationsBackward[i]), inputHeight, inputWidth, filterHeight, filterWidth,
      //        tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);
      Im2colFast(xTr, activationsBackward.At(i).GetMatrix(), vIndices);

      // std::cout << "doing im2colfast" << std::endl;
      // TMVA_DNN_PrintTCpuMatrix(xTr,"xTr-i");
      // TMVA_DNN_PrintTCpuMatrix(activationsBackward[i],"actbackward-i");
      Matrix_t mres = vres.At(i).GetMatrix();
      Multiply(mres, df.At(i).GetMatrix(), xTr);
      // TMVA_DNN_PrintTCpuMatrix(vres[i],"res_ofMT");

      return;
      // return res;
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(fmap, ROOT::TSeqI(batchSize));

   // auto freduce = [&](const TCpuTensor<AFloat> &vres) {
   R__ASSERT(vres.GetFirstSize() == batchSize);
   for (size_t i = 0; i < batchSize; i++) {
      // TMVA_DNN_PrintTCpuMatrix(vres[i],"res");
      Matrix_t vres_m = vres.At(i).GetMatrix();
      for (size_t j = 0; j < depth; j++) {
         for (size_t k = 0; k < filterDepth; k++) {
            size_t kOffset = k * filterSize;
            for (size_t l = 0; l < filterSize; l++) {
               // weightGradients(j, k * (filterHeight * filterWidth) + l) += res(k, (tempNLocalViews - 1) - l);
               weightGradients(j, kOffset + l) += vres_m(j, kOffset + l);
            }
         }
      }
      // TMVA_DNN_PrintTCpuMatrix(weightGradients,"weights_i");
   }
   // };

   // TCpuMatrix<AFloat>::GetThreadExecutor().MapReduce(fmap, ROOT::TSeqI( batchSize ), freduce);
   // TMVA_DNN_PrintTCpuMatrix(weightGradients,"W-Grad");
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::CalculateConvBiasGradients(TCpuMatrix<AFloat> &biasGradients, const TCpuTensor<AFloat> &df,
                                              size_t batchSize, size_t depth, size_t nLocalViews)
{
   biasGradients.Zero();
   for (size_t i = 0; i < depth; i++) {
      AFloat sum = 0;
      for (size_t j = 0; j < nLocalViews; j++) {
         for (size_t k = 0; k < batchSize; k++) {
            sum += df(k, i, j);
            // sum += df[k](i, j);
         }
      }
      biasGradients(i, 0) = sum;
   }
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Downsample(TCpuTensor<AFloat> &tA, TCpuTensor<AFloat> &tB, const TCpuTensor<AFloat> &tC,
                              const PoolingDescriptors_t & /*descriptors*/,
                              PoolingWorkspace_t & /*workspace*/,
                              size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows,
                              size_t strideCols)
{
   // A is the output, B is a cached index tensor used for the backward pass, and C is the input
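   // Max pooling: each local view keeps only its largest input value, and B
   // records the flat index (row * imgWidth + col) of that winner so the
   // backward pass can route the gradient to the right input pixel.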

   assert(tA.GetFirstSize() == tC.GetFirstSize());
   for (size_t ifirst = 0; ifirst < tC.GetFirstSize(); ++ifirst) {

      Matrix_t A = tA.At(ifirst).GetMatrix();
      Matrix_t B = tB.At(ifirst).GetMatrix();
      Matrix_t C = tC.At(ifirst).GetMatrix();

      // image boundaries
      int imgHeightBound = imgHeight - (fltHeight - 1) / 2 - 1;
      int imgWidthBound = imgWidth - (fltWidth - 1) / 2 - 1;
      size_t currLocalView = 0;

      // centers
      for (int i = fltHeight / 2; i <= imgHeightBound; i += strideRows) {
         for (int j = fltWidth / 2; j <= imgWidthBound; j += strideCols) {
            // within local views
            for (int m = 0; m < (Int_t)C.GetNrows(); m++) {
               AFloat value = -std::numeric_limits<AFloat>::max();

               for (int k = i - fltHeight / 2; k <= Int_t(i + (fltHeight - 1) / 2); k++) {
                  for (int l = j - fltWidth / 2; l <= Int_t(j + (fltWidth - 1) / 2); l++) {
                     if (C(m, k * imgWidth + l) > value) {
                        value = C(m, k * imgWidth + l);
                        B(m, currLocalView) = k * imgWidth + l;
                     }
                  }
               }
               A(m, currLocalView) = value;
            }
            currLocalView++;
         }
      }
   }
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::MaxPoolLayerBackward(TCpuTensor<AFloat> &activationGradientsBackward,
                                        const TCpuTensor<AFloat> &activationGradients,
                                        const TCpuTensor<AFloat> &indexMatrix,
                                        const TCpuTensor<AFloat> & /*inputActivation*/,
                                        const TCpuTensor<AFloat> & /*outputTensor*/,
                                        const PoolingDescriptors_t & /*descriptors*/,
                                        PoolingWorkspace_t & /*workspace*/,
                                        size_t /* imgHeight */,
                                        size_t /* imgWidth */,
                                        size_t /* fltHeight */,
                                        size_t /* fltWidth */,
                                        size_t /* strideRows */,
                                        size_t /* strideCols */,
                                        size_t nLocalViews)
{
   assert(activationGradientsBackward.GetFirstSize() == activationGradients.GetFirstSize());
   for (size_t l = 0; l < activationGradients.GetFirstSize(); ++l) {

      Matrix_t activationGradientsBackward_m = activationGradientsBackward.At(l).GetMatrix();
      Matrix_t activationGradients_m = activationGradients.At(l).GetMatrix();
      Matrix_t indexMatrix_m = indexMatrix.At(l).GetMatrix();

      size_t depth = activationGradientsBackward_m.GetNrows();

      for (size_t j = 0; j < depth; j++) {
         // initialize to zeros
         for (size_t t = 0; t < (size_t)activationGradientsBackward_m.GetNcols(); t++) {
            activationGradientsBackward_m(j, t) = 0;
         }

         // set the values
         for (size_t k = 0; k < nLocalViews; k++) {
            AFloat grad = activationGradients_m(j, k);
            size_t winningIdx = indexMatrix_m(j, k);
            activationGradientsBackward_m(j, winningIdx) += grad;
         }
      }
   }
}

//____________________________________________________________________________
template <typename AFloat>
TCpuTensor<AFloat> TCpu<AFloat>::BatchNormLayerReshapeTensor(int axis, const TCpuTensor<AFloat> &x)
{
   // reshape the tensor for the batch norm layer according to the normalization axis
   // input x - output: reshaped view of x
   if (axis == 1) {
      // reshape to a row-major tensor so the same indices can be used; in this case the shape is (H x W x B, C)
      typename TCpuTensor<AFloat>::Shape_t newShape = {x.GetSize() / x.GetShape().front(), x.GetShape().front()};
      TCpuTensor<AFloat> xtmp(x.GetDeviceBuffer(), newShape, TCpuTensor<AFloat>::MemoryLayout::RowMajor);
      return xtmp;
   }
   // dense layer case (axis == -1)
   return x.Reshape({x.GetShape().front(), x.GetSize() / x.GetShape().front()});
   // what to do with the time layer?
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerForwardTraining(int axis, const TCpuTensor<AFloat> &x,
                                                 TCpuTensor<AFloat> &y,
                                                 Matrix_t &gamma, Matrix_t &beta,
                                                 Matrix_t &mean,
                                                 Matrix_t &variance,
                                                 Matrix_t &iVariance,
                                                 Matrix_t &runningMeans,
                                                 Matrix_t &runningVars,
                                                 Scalar_t nTrainedBatches,
                                                 Scalar_t momentum, Scalar_t epsilon,
                                                 const TensorDescriptor_t &)
                                                 // BNormWorkspace_t * workspace )
{
   // the tensors are reshaped so that the first coordinate runs over the values
   // being normalized and the second over the features whose statistics we compute,
   // e.g. (batch size, number of features) or
   // (B x H x W, C)
727  TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis,x);
728  TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis,y);
729 
730  assert (input.GetShape().size() == 2);
731  size_t n = input.GetShape()[0]; // size of coordinates we are normalizing (e.g batch size)
732  size_t d = input.GetShape()[1]; // size of the coordinate we are not normalizing (e.g. feature size)
733 
734  TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
735  TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();
736 
737 
738  // lambda implementing computation for each single component k we need to normalize
739  auto f = [&] (size_t k)
740  {
741 
742  auto inputK = inputBuffer.GetSubBuffer(k * n, n);
743  auto outputK = outputBuffer.GetSubBuffer(k * n, n);
744 
745  double meanK = 0;
746  meanK = 0;
747  for (size_t i = 0; i < n; i++) {
748  AFloat xi = inputK[i];
749  meanK += xi;
750  }
751  meanK = meanK/ n;
752 
753  double sq = 0;
754  for (size_t i = 0; i < n; i++) {
755  AFloat xi = inputK[i];
756  double xmu = xi - meanK;
757  sq = sq + (xmu * xmu);
758  outputK[i] = AFloat(xmu);
759  }
760  mean(0,k) = meanK;
761  variance(0,k) = sq / n;
762  iVariance(0,k) = 1. / std::sqrt(variance(0,k) + epsilon);
763 
764  double iVK = iVariance(0, k);
765  double gK = gamma(0, k);
766  double bK = beta(0, k);
767  for (size_t i = 0; i < n; i++) {
768  AFloat yi = outputK[i] ;
769  outputK[i] = AFloat( gK * iVK * yi + bK );
770  }
771 
772 
773  // fVar(0,k) -= epsilon;
774 
775  if (nTrainedBatches == 0) {
776  runningMeans(0,k) = mean(0,k);
777  runningVars(0,k) = variance(0,k) * (n) / (Scalar_t(n - 1) + epsilon);
778  } else {
779  double decay = momentum;
780  if (momentum < 0) decay = nTrainedBatches/Scalar_t(nTrainedBatches+1);
781  runningMeans(0,k) = decay * runningMeans(0,k) + (1. - decay) * mean(0,k);
782  runningVars(0,k) = decay * runningVars(0,k) + (1.-decay) * variance(0,k) * (n) / (Scalar_t(n - 1) + epsilon);
783  }
784  // std::cout << " training batch " << nTrainedBatches << " estimated mu : " << runningMeans(0, k)
785  // << " estimated var " << runningVars(0,k) << std::endl;
786 
787  }; // end f(k) definition
788 
789  TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d) );
790 }
791 
//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerForwardInference(int axis, const TCpuTensor<AFloat> &x,
                                                  Matrix_t &gamma,
                                                  Matrix_t &beta,
                                                  TCpuTensor<AFloat> &y,
                                                  const Matrix_t &runningMeans,
                                                  const Matrix_t &runningVars,
                                                  Scalar_t epsilon,
                                                  const TensorDescriptor_t &)
{
   TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis, x);
   TCpuTensor<AFloat> output = BatchNormLayerReshapeTensor(axis, y);

   assert(input.GetShape().size() == 2);
   size_t n = input.GetShape()[0]; // size of the coordinate we are normalizing (e.g. batch size)
   size_t d = input.GetShape()[1];

   TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
   TCpuBuffer<AFloat> &outputBuffer = output.GetDeviceBuffer();

   auto f = [&](size_t k) {

      auto inputK = inputBuffer.GetSubBuffer(k * n, n);
      auto outputK = outputBuffer.GetSubBuffer(k * n, n);

      double gK = gamma(0, k);
      double bK = beta(0, k);
      double mK = runningMeans(0, k);
      double vK = 1. / (sqrt(runningVars(0, k) + epsilon));

      // during inference just use the stored mean and variance
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         outputK[i] = AFloat(gK * (xi - mK) * vK + bK);
      }
   }; // end definition of f(k)

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d));
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::BatchNormLayerBackward(int axis, const TCpuTensor<AFloat> &x,
                                          const TCpuTensor<AFloat> &dy,
                                          TCpuTensor<AFloat> &dx,
                                          Matrix_t &gamma, // Matrix_t &beta, (not needed)
                                          Matrix_t &dgamma, Matrix_t &dbeta,
                                          const Matrix_t &mean,
                                          const Matrix_t &variance,
                                          const Matrix_t &iVariance,
                                          Scalar_t epsilon,
                                          const TensorDescriptor_t &)
{
   TCpuTensor<AFloat> input = BatchNormLayerReshapeTensor(axis, x);
   TCpuTensor<AFloat> inputGrad = BatchNormLayerReshapeTensor(axis, dx);
   TCpuTensor<AFloat> outputGrad = BatchNormLayerReshapeTensor(axis, dy);

   assert(outputGrad.GetShape().size() == 2);
   size_t n = outputGrad.GetShape()[0]; // size of the coordinate we are normalizing (e.g. batch size)
   size_t d = outputGrad.GetShape()[1];

   TCpuBuffer<AFloat> &inputBuffer = input.GetDeviceBuffer();
   TCpuBuffer<AFloat> &dyBuffer = outputGrad.GetDeviceBuffer();
   TCpuBuffer<AFloat> &dxBuffer = inputGrad.GetDeviceBuffer();


   // compute first the gradients for gamma and beta
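   // The per-feature gradients computed below follow the usual batch norm
   // backward formulas (written out here for reference):
   //    dbeta_k  = sum_i dy_ik
   //    dgamma_k = sum_i dy_ik * (x_ik - mu_k) / sqrt(var_k + epsilon)
   //    dx_ik    = gamma_k / (n * sqrt(var_k + epsilon)) *
   //               (n * dy_ik - dbeta_k - (x_ik - mu_k) * dgamma_k / sqrt(var_k + epsilon))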
   auto f = [&](size_t k) {
      dgamma(0, k) = 0;
      dbeta(0, k) = 0;
      auto inputK = inputBuffer.GetSubBuffer(k * n, n);
      auto outputGradK = dyBuffer.GetSubBuffer(k * n, n);
      auto inputGradK = dxBuffer.GetSubBuffer(k * n, n);
      auto meanK = mean(0, k);
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         double xhat = xi - meanK;
         dbeta(0, k) += outputGradK[i];
         dgamma(0, k) += outputGradK[i] * xhat;
      }
      double npSumDy = dbeta(0, k);
      double npSumDyHMu = dgamma(0, k);
      dgamma(0, k) *= iVariance(0, k);

      // compute the gradients with respect to the input
      double bterm = npSumDyHMu / (variance(0, k) + epsilon);
      double aterm = (1. / double(n) * gamma(0, k) * iVariance(0, k));
      for (size_t i = 0; i < n; i++) {
         AFloat xi = inputK[i];
         AFloat dyi = outputGradK[i];
         double xmu = xi - meanK;
         inputGradK[i] = AFloat(aterm * (n * dyi - npSumDy - xmu * bterm));
      }
   };

   TCpuMatrix<AFloat>::GetThreadExecutor().Foreach(f, ROOT::TSeqI(d));
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Reshape(TCpuMatrix<AFloat> &A, const TCpuMatrix<AFloat> &B)
{
   size_t nColsA = A.GetNcols();
   size_t nColsB = B.GetNcols();

   for (size_t i = 0; i < A.GetNrows(); i++) {
      for (size_t j = 0; j < A.GetNcols(); j++) {
         size_t nElem = i * nColsA + j;
         A(i, j) = B(nElem / nColsB, nElem % nColsB);
      }
   }
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Flatten(TCpuTensor<AFloat> &A, const TCpuTensor<AFloat> &B)
{
   // printf("input tensor %f \n", B(0,0,0));
   // std::cout << "Flatten CPU arch " << std::endl;

   assert(B.GetShape().size() == 3);
   assert(A.GetShape().size() == 3);


   size_t bsize = B.GetFirstSize();
   size_t nRows = B.GetHSize();
   size_t nCols = B.GetWSize();

   assert(A.GetFirstSize() == 1);
   assert(A.GetHSize() == bsize);
   assert(A.GetWSize() == nRows * nCols);

   for (size_t i = 0; i < bsize; i++) {
      for (size_t j = 0; j < nRows; j++) {
         for (size_t k = 0; k < nCols; k++) {
            A(0, i, j * nCols + k) = B(i, j, k);
         }
      }
   }

   // size_t bsize = B.GetFirstSize();
   // size_t n = B.GetSize()/bsize;
   // if (B.GetLayout() == TCpuTensor<AFloat>::MemoryLayout::ColumnMajor ) {
   //
   // }
   // A = B.Reshape(bsize, n)
}

//____________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Deflatten(TCpuTensor<AFloat> &A, const TCpuTensor<AFloat> &B)
{
   assert(B.GetShape().size() == 3);
   assert(A.GetShape().size() == 3);

   size_t size = A.GetFirstSize();
   size_t nRows = A.GetHSize();
   size_t nCols = A.GetWSize();

   assert(B.GetFirstSize() == 1);
   assert(B.GetHSize() == size);
   assert(B.GetWSize() == nRows * nCols);
   for (size_t i = 0; i < size; i++) {
      for (size_t j = 0; j < nRows; j++) {
         for (size_t k = 0; k < nCols; k++) {
            A(i, j, k) = B(0, i, j * nCols + k);
         }
      }
   }
}

//______________________________________________________________________________
template <typename AFloat>
void TCpu<AFloat>::Rearrange(Tensor_t &out, const Tensor_t &in)
{
   // B x T x D out --- T x B x D in
   assert(out.GetShape().size() == 3 && in.GetShape().size() == 3);


   size_t B = out.GetFirstSize();
   size_t T = out.GetCSize(); // 1 for row-major
   size_t D = out.GetWSize(); // 2 for row-major
   if ((T != in.GetFirstSize()) || (B != in.GetCSize()) || (D != in.GetWSize())) {
      std::cout << "Incompatible Dimensions\n"
                << in.GetFirstSize() << "x" << in.GetCSize() << "x" << in.GetWSize() << " --> " << B << "x" << T << "x"
                << D << "\n";
      assert(false);
      return;
   }
   for (size_t i = 0; i < B; ++i) {
      for (size_t j = 0; j < T; ++j) {
         for (size_t k = 0; k < D; ++k) {
            out(i, j, k) = in(j, i, k);
         }
      }
   }
   return;
}

} // namespace DNN
} // namespace TMVA