26 #include "cuda_runtime.h"
//______________________________________________________________________________
/// Custom deleter for the pinned host buffer: release the page-locked
/// allocation first, then the single-element pointer array that owned it.
template <typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}
//______________________________________________________________________________
/// Construct a pinned (page-locked) host buffer holding \p size elements.
/// Ownership of the allocation is shared through fHostPointer, which invokes
/// TDestructor when the last reference goes away.
template <typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size)
   : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   // NOTE(review): cudaMallocHost return status is not checked here — a failed
   // allocation will only surface at first use. Confirm whether callers rely on this.
   cudaMallocHost(pointer, size * sizeof(AFloat));
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
/// Implicit conversion to a raw pointer to the first element of this
/// (possibly offset) view into the pinned host allocation.
template <typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat *() const
{
   return *fHostPointer + fOffset;
}
//______________________________________________________________________________
/// Return a view onto this buffer starting at \p offset and spanning \p size
/// elements. The view shares ownership of the underlying pinned allocation.
/// Fix: the visible original never used \p size and never returned the
/// sub-buffer; the view now records its own size and is returned.
template <typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaHostBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}
//______________________________________________________________________________
/// Fill the buffer with \p constVal on the host side. Note that the fill
/// starts at the base pointer (not at fOffset) and covers fSize elements.
template <typename AFloat>
void TCudaHostBuffer<AFloat>::SetConstVal(const AFloat constVal)
{
   std::fill(*fHostPointer, *fHostPointer + fSize, constVal);
}
//______________________________________________________________________________
/// Custom deleter for the device buffer: free the device allocation, then
/// the single-element pointer array that carried it.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}
//______________________________________________________________________________
/// Construct a device buffer of \p size elements together with a dedicated
/// compute stream on which transfers involving this buffer are ordered.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size)
   : fOffset(0), fSize(size), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   cudaStreamCreate(&fComputeStream);
}
//______________________________________________________________________________
/// Construct a device buffer of \p size elements that uses the caller's
/// existing \p stream for its transfers instead of creating its own.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
/// Wrap an existing device allocation of \p size elements. Takes ownership:
/// the shared deleter will cudaFree \p devicePointer when the last reference
/// to this buffer is dropped, so the caller must not free it.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   *pointer = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}
//______________________________________________________________________________
/// Return a view onto this device buffer starting at \p offset and spanning
/// \p size elements. The view shares ownership of the device allocation and
/// inherits the compute stream.
/// Fix: the visible original never used \p size and never returned the
/// sub-buffer; the view now records its own size and is returned.
template <typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}
//______________________________________________________________________________
/// Implicit conversion to a raw device pointer to the first element of this
/// (possibly offset) view. The result is only valid for use in device code
/// or CUDA runtime calls.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat *() const
{
   return *fDevicePointer + fOffset;
}
//______________________________________________________________________________
/// Asynchronously copy the pinned host \p buffer into this device buffer on
/// fComputeStream. The stream is synchronized first — presumably so that any
/// previously enqueued work touching these buffers has drained (TODO confirm).
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaStreamSynchronize(fComputeStream);
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
}
//______________________________________________________________________________
/// Asynchronously copy this device buffer into the pinned host \p buffer on
/// fComputeStream, and record that stream on the host buffer so consumers can
/// synchronize before reading. NOTE(review): assigning through a const
/// reference implies buffer.fComputeStream is mutable — verify in the header.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
   buffer.fComputeStream = fComputeStream;
}
//______________________________________________________________________________
/// Copy one batch of input rows into \p buffer in column-major layout
/// (bufferIndex = j * batchSize + i), converting Double_t to float.
/// Fix: advance \p sampleIterator once per sample; in the visible original the
/// iterator was never incremented, so every row re-read the same sample.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of output rows into \p buffer in column-major layout,
/// converting Double_t to float.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every row re-read the same sample.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of per-sample weights (column 0 of the weight matrix) into
/// \p buffer, converting Double_t to float.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every slot received the same weight.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of event variables into \p buffer in column-major layout
/// (bufferIndex = j * batchSize + i). The variable count is taken from the
/// first event in the dataset.
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                       size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of targets into \p buffer in column-major layout. Events
/// without explicit targets are encoded from their class: binary
/// signal/background for a single output column, one-hot otherwise.
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Single output column: binary classification encoding.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiple output columns: one-hot class encoding.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of per-event weights into \p buffer, one weight per sample.
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<float>(event->GetWeight());
   }
}
//______________________________________________________________________________
/// Copy one batch of input rows into \p buffer in column-major layout
/// (double precision, no conversion needed).
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every row re-read the same sample.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of output rows into \p buffer in column-major layout
/// (double precision, no conversion needed).
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every row re-read the same sample.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of per-sample weights (column 0 of the weight matrix) into
/// \p buffer.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every slot received the same weight.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                            IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of event variables into \p buffer in column-major layout.
/// The variable count is taken from the first event in the dataset.
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of targets into \p buffer in column-major layout. Events
/// without explicit targets are encoded from their class: binary
/// signal/background for a single output column, one-hot otherwise.
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Single output column: binary classification encoding.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiple output columns: one-hot class encoding.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of per-event weights into \p buffer, one weight per sample.
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<double>(event->GetWeight());
   }
}
//______________________________________________________________________________
/// Copy one batch of tensor input into \p buffer. With fBatchDepth == 1 the
/// samples are rows of the single input matrix; otherwise each sample is one
/// depth slice (one matrix per sample).
/// Fix: advance \p sampleIterator once per sample in both branches; the
/// visible original never incremented it.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   if (fBatchDepth == 1) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
         }
         sampleIterator++;
      }
   } else {
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
            }
         }
         sampleIterator++;
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of output rows into \p buffer in column-major layout,
/// converting Double_t to float.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every row re-read the same sample.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of per-sample weights (column 0 of the weight matrix) into
/// \p buffer, converting Double_t to float.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every slot received the same weight.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of event data into \p buffer. Two layouts are supported:
/// a single depth slice with events along the height axis, or one depth
/// slice per event. Any other combination is reported as an error.
/// Fix: advance \p sampleIterator once per sample in both branches; the
/// visible original never incremented it.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}
//______________________________________________________________________________
/// Copy one batch of targets into \p buffer in column-major layout. Events
/// without explicit targets are encoded from their class: binary
/// signal/background for a single output column, one-hot otherwise.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * fBatchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Single output column: binary classification encoding.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiple output columns: one-hot class encoding.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<Float_t>(event->GetTarget(j));
         }
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of per-event weights into \p buffer, one weight per sample.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}
//______________________________________________________________________________
/// Copy one batch of tensor input into \p buffer. With fBatchDepth == 1 the
/// samples are rows of the single input matrix; otherwise each sample is one
/// depth slice (one matrix per sample).
/// Fixes: (1) the original narrowed each Double_t through static_cast<float>
/// before storing it into the double buffer, silently losing precision —
/// values are now assigned directly, consistent with the other double
/// specializations; (2) \p sampleIterator is advanced once per sample, which
/// the visible original never did.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   if (fBatchDepth == 1) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = inputTensor[0](sampleIndex, j);
         }
         sampleIterator++;
      }
   } else {
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
            }
         }
         sampleIterator++;
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of output rows into \p buffer in column-major layout.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every row re-read the same sample.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of per-sample weights (column 0 of the weight matrix) into
/// \p buffer.
/// Fix: advance \p sampleIterator once per sample; the visible original never
/// incremented it, so every slot received the same weight.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                        IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);

   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = weightMatrix(*sampleIterator, 0);
      sampleIterator++;
   }
}
//______________________________________________________________________________
/// Copy one batch of event data into \p buffer. Two layouts are supported:
/// a single depth slice with events along the height axis, or one depth
/// slice per event. Any other combination is reported as an error.
/// Fix: advance \p sampleIterator once per sample in both branches; the
/// visible original never incremented it.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}
//______________________________________________________________________________
/// Copy one batch of targets into \p buffer in column-major layout. Events
/// without explicit targets are encoded from their class: binary
/// signal/background for a single output column, one-hot otherwise.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; ++j) {
         size_t bufferIndex = j * fBatchSize + i;
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Single output column: binary classification encoding.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiple output columns: one-hot class encoding.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<Double_t>(event->GetTarget(j));
         }
      }
   }
}
//______________________________________________________________________________
/// Copy one batch of per-event weights into \p buffer, one weight per sample.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                        IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; ++i) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}
//______________________________________________________________________________
/// Assemble the next batch: copy the data to the device and wrap the device
/// buffers in per-sample input matrices plus output and weight matrices.
/// Fix: the original input-tensor declaration was malformed
/// (`std::vector<Matrix_t> inputTensor(std::get<0>(DeviceBuffers), fBatchSize, )`
/// — a syntax error with a dangling comma); the vector is now
/// default-constructed and filled in the loop, matching the other three
/// GetTensorBatch specializations.
template <>
TTensorBatch<TCuda<float>> TTensorDataLoader<TensorInput, TCuda<float>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   std::vector<Matrix_t> inputTensor;
   inputTensor.reserve(fBatchSize);
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
}
//______________________________________________________________________________
/// Assemble the next batch: copy the data to the device and wrap the device
/// buffers in per-sample input matrices plus output and weight matrices.
template <>
TTensorBatch<TCuda<double>> TTensorDataLoader<TensorInput, TCuda<double>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   // One fBatchHeight x fBatchWidth matrix per sample, each viewing its own
   // slice of the shared device input buffer.
   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; ++i) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
}
//______________________________________________________________________________
/// Assemble the next batch: copy the data to the device and wrap the device
/// buffers in per-sample input matrices plus output and weight matrices.
template <>
TTensorBatch<TCuda<float>> TTensorDataLoader<TMVAInput_t, TCuda<float>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   // One fBatchHeight x fBatchWidth matrix per sample, each viewing its own
   // slice of the shared device input buffer.
   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; ++i) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
}
//______________________________________________________________________________
/// Assemble the next batch: copy the data to the device and wrap the device
/// buffers in per-sample input matrices plus output and weight matrices.
template <>
TTensorBatch<TCuda<double>> TTensorDataLoader<TMVAInput_t, TCuda<double>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   // One fBatchHeight x fBatchWidth matrix per sample, each viewing its own
   // slice of the shared device input buffer.
   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; ++i) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
}
// Explicit template instantiations for the supported floating-point types.
template class TCudaDeviceBuffer<float>;
template class TCudaDeviceBuffer<double>;

template class TCudaHostBuffer<float>;
template class TCudaHostBuffer<double>;

template class TDataLoader<MatrixInput_t, TCuda<float>>;
template class TDataLoader<TMVAInput_t, TCuda<float>>;
template class TDataLoader<MatrixInput_t, TCuda<double>>;
template class TDataLoader<TMVAInput_t, TCuda<double>>;

template class TTensorDataLoader<TensorInput, TCuda<float>>;
template class TTensorDataLoader<TMVAInput_t, TCuda<float>>;
template class TTensorDataLoader<TensorInput, TCuda<double>>;
template class TTensorDataLoader<TMVAInput_t, TCuda<double>>;