Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
CudaBuffers.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 07/08/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ////////////////////////////////////////////////////////////////////////
13 // Implementation of device and host buffers for CUDA architectures. //
14 ////////////////////////////////////////////////////////////////////////
15 
16 #include "TMVA/DataSetInfo.h"
17 #include "TMVA/DNN/DataLoader.h"
18 
21 #ifdef R__HAS_CUDNN
23 #endif
25 
26 #include "cuda_runtime.h"
27 #include <algorithm>
28 
29 namespace TMVA {
30 namespace DNN {
31 
32 //
33 // TCudaHostBuffer
34 //______________________________________________________________________________
35 template <typename AFloat>
36 void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
37 {
38  cudaFreeHost(*devicePointer);
39  delete[] devicePointer;
40 }
41 
42 //______________________________________________________________________________
43 template <typename AFloat>
44 TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
45 {
46  AFloat **pointer = new AFloat *[1];
47  cudaMallocHost(pointer, size * sizeof(AFloat));
48  fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
49 }
50 
51 //______________________________________________________________________________
52 template <typename AFloat>
53 TCudaHostBuffer<AFloat>::operator AFloat *() const
54 {
55  return *fHostPointer + fOffset;
56 }
57 
58 //______________________________________________________________________________
59 template <typename AFloat>
60 TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
61 {
62  TCudaHostBuffer buffer = *this;
63  buffer.fOffset = offset;
64  buffer.fSize = size;
65  return buffer;
66 }
67 
68 //______________________________________________________________________________
69 template <typename AFloat>
70 void TCudaHostBuffer<AFloat>::SetConstVal(const AFloat constVal)
71 {
72  std::fill(*fHostPointer, *fHostPointer+fSize, constVal);
73 }
74 
75 //
76 // TCudaDevicePointer
77 //______________________________________________________________________________
78 template <typename AFloat>
79 void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
80 {
81  cudaFree(*devicePointer);
82  delete[] devicePointer;
83 }
84 
85 //______________________________________________________________________________
86 template <typename AFloat>
87 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) : fOffset(0), fSize(size), fDestructor()
88 {
89  AFloat **pointer = new AFloat *[1];
90  cudaMalloc(pointer, size * sizeof(AFloat));
91  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
92  cudaStreamCreate(&fComputeStream);
93 }
94 
95 //______________________________________________________________________________
96 template <typename AFloat>
97 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
98  : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
99 {
100  AFloat **pointer = new AFloat *[1];
101  cudaMalloc(pointer, size * sizeof(AFloat));
102  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
103 }
104 
105 //______________________________________________________________________________
106 template <typename AFloat>
107 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
108  : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
109 {
110  AFloat **pointer = new AFloat *[1];
111  *pointer = devicePointer;
112  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
113 }
114 
115 //______________________________________________________________________________
116 template <typename AFloat>
117 TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
118 {
119  TCudaDeviceBuffer buffer = *this;
120  buffer.fOffset = offset;
121  buffer.fSize = size;
122  return buffer;
123 }
124 
125 //______________________________________________________________________________
126 template <typename AFloat>
127 TCudaDeviceBuffer<AFloat>::operator AFloat *() const
128 {
129  return *fDevicePointer + fOffset;
130 }
131 
132 //______________________________________________________________________________
133 template <typename AFloat>
134 void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
135 {
136  cudaStreamSynchronize(fComputeStream);
137  cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
138 }
139 
140 //______________________________________________________________________________
141 template <typename AFloat>
142 void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
143 {
144  cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
145  buffer.fComputeStream = fComputeStream;
146 }
147 
148 //______________________________________________________________________________
149 template <>
150 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
151  size_t batchSize)
152 {
153  const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
154  size_t n = inputMatrix.GetNcols();
155 
156  for (size_t i = 0; i < batchSize; i++) {
157  size_t sampleIndex = *sampleIterator;
158  for (size_t j = 0; j < n; j++) {
159  size_t bufferIndex = j * batchSize + i;
160  buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
161  }
162  sampleIterator++;
163  }
164 }
165 
166 //______________________________________________________________________________
167 template <>
168 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
169  IndexIterator_t sampleIterator, size_t batchSize)
170 {
171  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
172  size_t n = outputMatrix.GetNcols();
173 
174  for (size_t i = 0; i < batchSize; i++) {
175  size_t sampleIndex = *sampleIterator;
176  for (size_t j = 0; j < n; j++) {
177  size_t bufferIndex = j * batchSize + i;
178  buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
179  }
180  sampleIterator++;
181  }
182 }
183 
184 //______________________________________________________________________________
185 template <>
186 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
187  IndexIterator_t sampleIterator, size_t batchSize)
188 {
189  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
190  for (size_t i = 0; i < batchSize; i++) {
191  buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
192  sampleIterator++;
193  }
194 }
195 
196 //______________________________________________________________________________
197 template <>
198 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
199  size_t batchSize)
200 {
201  Event *event = std::get<0>(fData)[0];
202  size_t n = event->GetNVariables();
203  for (size_t i = 0; i < batchSize; i++) {
204  size_t sampleIndex = * sampleIterator++;
205  event = std::get<0>(fData)[sampleIndex];
206  for (size_t j = 0; j < n; j++) {
207  size_t bufferIndex = j * batchSize + i;
208  buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
209  }
210  }
211 }
212 
213 //______________________________________________________________________________
214 template <>
215 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
216  size_t batchSize)
217 {
218  const DataSetInfo &info = std::get<1>(fData);
219  size_t n = buffer.GetSize() / batchSize;
220 
221  // Copy target(s).
222 
223  for (size_t i = 0; i < batchSize; i++) {
224  size_t sampleIndex = *sampleIterator++;
225  Event *event = std::get<0>(fData)[sampleIndex];
226  for (size_t j = 0; j < n; j++) {
227  // Copy output matrices.
228  size_t bufferIndex = j * batchSize + i;
229  // Classification
230  if (event->GetNTargets() == 0) {
231  if (n == 1) {
232  // Binary.
233  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
234  } else {
235  // Multiclass.
236  buffer[bufferIndex] = 0.0;
237  if (j == event->GetClass()) {
238  buffer[bufferIndex] = 1.0;
239  }
240  }
241  } else {
242  buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
243  }
244  }
245  }
246 }
247 
248 //______________________________________________________________________________
249 template <>
250 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
251  size_t batchSize)
252 {
253  for (size_t i = 0; i < batchSize; i++) {
254  size_t sampleIndex = *sampleIterator++;
255  Event *event = std::get<0>(fData)[sampleIndex];
256  buffer[i] = static_cast<float>(event->GetWeight());
257  }
258 }
259 
260 //______________________________________________________________________________
261 template <>
262 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
263  IndexIterator_t sampleIterator, size_t batchSize)
264 {
265  const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
266  size_t n = inputMatrix.GetNcols();
267 
268  for (size_t i = 0; i < batchSize; i++) {
269  size_t sampleIndex = *sampleIterator;
270  for (size_t j = 0; j < n; j++) {
271  size_t bufferIndex = j * batchSize + i;
272  buffer[bufferIndex] = inputMatrix(sampleIndex, j);
273  }
274  sampleIterator++;
275  }
276 }
277 
278 //______________________________________________________________________________
279 template <>
280 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
281  IndexIterator_t sampleIterator, size_t batchSize)
282 {
283  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
284  size_t n = outputMatrix.GetNcols();
285 
286  for (size_t i = 0; i < batchSize; i++) {
287  size_t sampleIndex = *sampleIterator;
288  for (size_t j = 0; j < n; j++) {
289  size_t bufferIndex = j * batchSize + i;
290  buffer[bufferIndex] = outputMatrix(sampleIndex, j);
291  }
292  sampleIterator++;
293  }
294 }
295 
296 //______________________________________________________________________________
297 template <>
298 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
299  IndexIterator_t sampleIterator, size_t batchSize)
300 {
301  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
302  for (size_t i = 0; i < batchSize; i++) {
303  buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
304  sampleIterator++;
305  }
306 }
307 
308 //______________________________________________________________________________
309 template <>
310 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer, IndexIterator_t sampleIterator,
311  size_t batchSize)
312 {
313  Event *event = std::get<0>(fData)[0];
314  size_t n = event->GetNVariables();
315  for (size_t i = 0; i < batchSize; i++) {
316  size_t sampleIndex = * sampleIterator++;
317  event = std::get<0>(fData)[sampleIndex];
318  for (size_t j = 0; j < n; j++) {
319  size_t bufferIndex = j * batchSize + i;
320  buffer[bufferIndex] = event->GetValue(j);
321  }
322  }
323 }
324 
325 //______________________________________________________________________________
326 template <>
327 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
328  IndexIterator_t sampleIterator, size_t batchSize)
329 {
330  const DataSetInfo &info = std::get<1>(fData);
331  size_t n = buffer.GetSize() / batchSize;
332 
333  // Copy target(s).
334 
335  for (size_t i = 0; i < batchSize; i++) {
336  size_t sampleIndex = *sampleIterator++;
337  Event *event = std::get<0>(fData)[sampleIndex];
338  for (size_t j = 0; j < n; j++) {
339  // Copy output matrices.
340  size_t bufferIndex = j * batchSize + i;
341  // Classification
342  if (event->GetNTargets() == 0) {
343  // Binary.
344  if (n == 1) {
345  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
346  } else {
347  // Multiclass.
348  buffer[bufferIndex] = 0.0;
349  if (j == event->GetClass()) {
350  buffer[bufferIndex] = 1.0;
351  }
352  }
353  } else {
354  buffer[bufferIndex] = event->GetTarget(j);
355  }
356  }
357  }
358 }
359 
360 //______________________________________________________________________________
361 template <>
362 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
363  IndexIterator_t sampleIterator, size_t batchSize)
364 {
365  for (size_t i = 0; i < batchSize; i++) {
366  size_t sampleIndex = *sampleIterator++;
367  Event *event = std::get<0>(fData)[sampleIndex];
368  buffer[i] = static_cast<double>(event->GetWeight());
369  }
370 }
371 
372 //______________________________________________________________________________
373 template <>
374 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
375  IndexIterator_t sampleIterator)
376 {
377  const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
378 
379  if (fBatchDepth == 1) {
380  for (size_t i = 0; i < fBatchHeight; i++) {
381  size_t sampleIndex = *sampleIterator;
382  for (size_t j = 0; j < fBatchWidth; j++) {
383  size_t bufferIndex = j * fBatchHeight + i;
384  buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
385  }
386  sampleIterator++;
387  }
388  } else {
389  for (size_t i = 0; i < fBatchDepth; i++) {
390  size_t sampleIndex = *sampleIterator;
391  for (size_t j = 0; j < fBatchHeight; j++) {
392  for (size_t k = 0; k < fBatchWidth; k++) {
393  size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
394  buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
395  }
396  }
397  sampleIterator++;
398  }
399  }
400 }
401 
402 //______________________________________________________________________________
403 template <>
404 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
405  IndexIterator_t sampleIterator)
406 {
407  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
408  size_t n = outputMatrix.GetNcols();
409 
410  for (size_t i = 0; i < fBatchSize; i++) {
411  size_t sampleIndex = *sampleIterator;
412  for (size_t j = 0; j < n; j++) {
413  size_t bufferIndex = j * fBatchSize + i;
414  buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
415  }
416  sampleIterator++;
417  }
418 }
419 
420 //______________________________________________________________________________
421 template <>
422 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
423  IndexIterator_t sampleIterator)
424 {
425  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
426  for (size_t i = 0; i < fBatchSize; i++) {
427  buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
428  sampleIterator++;
429  }
430 }
431 
432 //______________________________________________________________________________
433 template <>
434 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
435  IndexIterator_t sampleIterator)
436 {
437  // one event, one example in the batch
438 
439  if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
440  for (size_t i = 0; i < fBatchHeight; i++) {
441  size_t sampleIndex = *sampleIterator;
442  Event * event = std::get<0>(fData)[sampleIndex];
443  for (size_t j = 0; j < fBatchWidth; j++) {
444  size_t bufferIndex = j * fBatchHeight + i;
445  buffer[bufferIndex] = event->GetValue(j);
446  }
447  sampleIterator++;
448  }
449  } else if (fBatchDepth == fBatchSize) {
450  // batchDepth is batch size
451  for (size_t i = 0; i < fBatchDepth; i++) {
452  size_t sampleIndex = *sampleIterator;
453  Event * event = std::get<0>(fData)[sampleIndex];
454  for (size_t j = 0; j < fBatchHeight; j++) {
455  for (size_t k = 0; k < fBatchWidth; k++) {
456  // because of the column-major ordering
457  size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
458  buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
459  }
460  }
461  sampleIterator++;
462  }
463  }
464  else {
465  std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
466  Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
467  R__ASSERT(0);
468  }
469 }
470 //______________________________________________________________________________
471 template <>
472 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
473  IndexIterator_t sampleIterator)
474 {
475  const DataSetInfo &info = std::get<1>(fData);
476  size_t n = buffer.GetSize() / fBatchSize;
477 
478  // Copy target(s).
479 
480  for (size_t i = 0; i < fBatchSize; i++) {
481  size_t sampleIndex = *sampleIterator++;
482  Event *event = std::get<0>(fData)[sampleIndex];
483  for (size_t j = 0; j < n; j++) {
484  // Copy output matrices.
485  size_t bufferIndex = j * fBatchSize + i;
486  // Classification
487  if (event->GetNTargets() == 0) {
488  if (n == 1) {
489  // Binary.
490  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
491  } else {
492  // Multiclass.
493  buffer[bufferIndex] = 0.0;
494  if (j == event->GetClass()) {
495  buffer[bufferIndex] = 1.0;
496  }
497  }
498  } else {
499  buffer[bufferIndex] = static_cast<Float_t>(event->GetTarget(j));
500  }
501  }
502  }
503 }
504 
505 //______________________________________________________________________________
506 template <>
507 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
508  IndexIterator_t sampleIterator)
509 {
510  for (size_t i = 0; i < fBatchSize; i++) {
511  size_t sampleIndex = *sampleIterator++;
512  Event *event = std::get<0>(fData)[sampleIndex];
513  buffer[i] = event->GetWeight();
514  }
515 }
516 
517 //______________________________________________________________________________
518 template <>
519 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
520  IndexIterator_t sampleIterator)
521 {
522  const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
523 
524  if (fBatchDepth == 1) {
525  for (size_t i = 0; i < fBatchHeight; i++) {
526  size_t sampleIndex = *sampleIterator;
527  for (size_t j = 0; j < fBatchWidth; j++) {
528  size_t bufferIndex = j * fBatchHeight + i;
529  buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
530  }
531  sampleIterator++;
532  }
533  } else {
534  for (size_t i = 0; i < fBatchDepth; i++) {
535  size_t sampleIndex = *sampleIterator;
536  for (size_t j = 0; j < fBatchHeight; j++) {
537  for (size_t k = 0; k < fBatchWidth; k++) {
538  size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
539  buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
540  }
541  }
542  sampleIterator++;
543  }
544  }
545 }
546 
547 //______________________________________________________________________________
548 template <>
549 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
550  IndexIterator_t sampleIterator)
551 {
552  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
553  size_t n = outputMatrix.GetNcols();
554 
555  for (size_t i = 0; i < fBatchSize; i++) {
556  size_t sampleIndex = *sampleIterator;
557  for (size_t j = 0; j < n; j++) {
558  size_t bufferIndex = j * fBatchSize + i;
559  buffer[bufferIndex] = outputMatrix(sampleIndex, j);
560  }
561  sampleIterator++;
562  }
563 }
564 
565 //______________________________________________________________________________
566 template <>
567 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
568  IndexIterator_t sampleIterator)
569 {
570  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
571 
572  for (size_t i = 0; i < fBatchSize; i++) {
573  buffer[i] = weightMatrix(*sampleIterator, 0);
574  sampleIterator++;
575  }
576 }
577 
578 //______________________________________________________________________________
579 template <>
580 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
581  IndexIterator_t sampleIterator)
582 {
583  // one event, one example in the batch
584 
585  if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
586  for (size_t i = 0; i < fBatchHeight; i++) {
587  size_t sampleIndex = *sampleIterator;
588  Event * event = std::get<0>(fData)[sampleIndex];
589  for (size_t j = 0; j < fBatchWidth; j++) {
590  size_t bufferIndex = j * fBatchHeight + i;
591  buffer[bufferIndex] = event->GetValue(j);
592  }
593  sampleIterator++;
594  }
595  } else if (fBatchDepth == fBatchSize) {
596  // batchDepth is batch size
597  for (size_t i = 0; i < fBatchDepth; i++) {
598  size_t sampleIndex = *sampleIterator;
599  Event * event = std::get<0>(fData)[sampleIndex];
600  for (size_t j = 0; j < fBatchHeight; j++) {
601  for (size_t k = 0; k < fBatchWidth; k++) {
602  // because of the column-major ordering
603  size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
604  buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
605  }
606  }
607  sampleIterator++;
608  }
609  }
610  else {
611  std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
612  Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
613  R__ASSERT(0);
614  }
615 }
616 
617 //______________________________________________________________________________
618 template <>
619 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
620  IndexIterator_t sampleIterator)
621 {
622  const DataSetInfo &info = std::get<1>(fData);
623  size_t n = buffer.GetSize() / fBatchSize;
624 
625  // Copy target(s).
626 
627  for (size_t i = 0; i < fBatchSize; i++) {
628  size_t sampleIndex = *sampleIterator++;
629  Event *event = std::get<0>(fData)[sampleIndex];
630  for (size_t j = 0; j < n; j++) {
631  // Copy output matrices.
632  size_t bufferIndex = j * fBatchSize + i;
633  // Classification
634  if (event->GetNTargets() == 0) {
635  if (n == 1) {
636  // Binary.
637  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
638  } else {
639  // Multiclass.
640  buffer[bufferIndex] = 0.0;
641  if (j == event->GetClass()) {
642  buffer[bufferIndex] = 1.0;
643  }
644  }
645  } else {
646  buffer[bufferIndex] = static_cast<Double_t>(event->GetTarget(j));
647  }
648  }
649  }
650 }
651 
652 //______________________________________________________________________________
653 template <>
654 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
655  IndexIterator_t sampleIterator)
656 {
657  for (size_t i = 0; i < fBatchSize; i++) {
658  size_t sampleIndex = *sampleIterator++;
659  Event *event = std::get<0>(fData)[sampleIndex];
660  buffer[i] = event->GetWeight();
661  }
662 }
663 
664 #if 0
665 //______________________________________________________________________________
666 template <>
667 TTensorBatch<TCuda<float> > TTensorDataLoader<TensorInput, TCuda<float> >::GetTensorBatch()
668 {
669  // After copying the data to the device, wrap the device buffer in the respective
670  // architectures matrix type
671  DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
672 
673  std::vector<Matrix_t> inputTensor(std::get<0>(DeviceBuffers), fBatchSize, )
674  size_t jump = fBatchHeight * fBatchWidth;
675  for (size_t i = 0; i < fBatchSize; i++) {
676  DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
677  inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
678  }
679  Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
680  Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
681 
682  fBatchIndex++;
683  return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
684 }
685 
686 //______________________________________________________________________________
687 template <>
688 TTensorBatch<TCuda<double> > TTensorDataLoader<TensorInput, TCuda<double> >::GetTensorBatch()
689 {
690  // After copying the data to the device, wrap the device buffer in the respective
691  // architectures matrix type
692  DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
693 
694  std::vector<Matrix_t> inputTensor;
695  size_t jump = fBatchHeight * fBatchWidth;
696  for (size_t i = 0; i < fBatchSize; i++) {
697  DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
698  inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
699  }
700  Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
701  Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
702 
703  fBatchIndex++;
704  return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
705 }
706 
707 //______________________________________________________________________________
708 template <>
709 TTensorBatch<TCuda<float> > TTensorDataLoader<TMVAInput_t, TCuda<float> >::GetTensorBatch()
710 {
711  // After copying the data to the device, wrap the device buffer in the respective
712  // architectures matrix type
713  DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
714 
715  std::vector<Matrix_t> inputTensor;
716  size_t jump = fBatchHeight * fBatchWidth;
717  for (size_t i = 0; i < fBatchSize; i++) {
718  DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
719  inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
720  }
721  Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
722  Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
723 
724  fBatchIndex++;
725  return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
726 }
727 
728 //______________________________________________________________________________
729 template <>
730 TTensorBatch<TCuda<double> > TTensorDataLoader<TMVAInput_t, TCuda<double> >::GetTensorBatch()
731 {
732  // After copying the data to the device, wrap the device buffer in the respective
733  // architectures matrix type
734  DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
735 
736  std::vector<Matrix_t> inputTensor;
737  size_t jump = fBatchHeight * fBatchWidth;
738  for (size_t i = 0; i < fBatchSize; i++) {
739  DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
740  inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
741  }
742  Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
743  Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
744 
745  fBatchIndex++;
746  return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
747 }
748 #endif
749 
750 // see file Cudnn/TensorDataLoader.cxx for Cudnn definitions
751 
752 //______________________________________________________________________________
753 // Explicit Instantiations.
754 
755 template class TCudaDeviceBuffer<float>;
756 template class TCudaDeviceBuffer<double>;
757 
758 template class TCudaHostBuffer<float>;
759 template class TCudaHostBuffer<double>;
760 
761 template class TDataLoader<MatrixInput_t, TCuda<float>>;
762 template class TDataLoader<TMVAInput_t, TCuda<float>>;
763 template class TDataLoader<MatrixInput_t, TCuda<double>>;
764 template class TDataLoader<TMVAInput_t, TCuda<double>>;
765 
766 template class TTensorDataLoader<TensorInput, TCuda<float> >;
767 template class TTensorDataLoader<TMVAInput_t, TCuda<float> >;
768 template class TTensorDataLoader<TensorInput, TCuda<double >>;
769 template class TTensorDataLoader<TMVAInput_t, TCuda<double> >;
770 
771 
772 } // TMVA
773 } // DNN