26 #include "cuda_runtime.h" 
   35 template <
typename AFloat>
 
   36 void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
 
   38    cudaFreeHost(*devicePointer);
 
   39    delete[] devicePointer;
 
   43 template <
typename AFloat>
 
   44 TCudaHostBuffer<AFloat>::TCudaHostBuffer(
size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
 
   46    AFloat **pointer = 
new AFloat *[1];
 
   47    cudaMallocHost(pointer, size * 
sizeof(AFloat));
 
   48    fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
 
   52 template <
typename AFloat>
 
   53 TCudaHostBuffer<AFloat>::operator AFloat *() 
const 
   55    return *fHostPointer + fOffset;
 
   // Builds a copy of this buffer (via `*this`) and sets the copy's fOffset to
   // `offset`.
   // NOTE(review): this listing is a lossy extraction. The embedded numbering
   // jumps from 63 to 69 and the braces are absent, so further statements
   // (presumably any use of `size` and the return of `buffer`) are not shown
   // here -- confirm against the original file.
   59 template <
typename AFloat>
 
   60 TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(
size_t offset, 
size_t size)
 
   62    TCudaHostBuffer buffer = *
this;
 
   63    buffer.fOffset = offset;
 
   69 template <
typename AFloat>
 
   70 void TCudaHostBuffer<AFloat>::SetConstVal(
const AFloat constVal)
 
   72    std::fill(*fHostPointer, *fHostPointer+fSize, constVal);
 
   78 template <
typename AFloat>
 
   79 void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
 
   81    cudaFree(*devicePointer);
 
   82    delete[] devicePointer;
 
   86 template <
typename AFloat>
 
   87 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(
size_t size) : fOffset(0), fSize(size), fDestructor()
 
   89    AFloat **pointer = 
new AFloat *[1];
 
   90    cudaMalloc(pointer, size * 
sizeof(AFloat));
 
   91    fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
 
   92    cudaStreamCreate(&fComputeStream);
 
   96 template <
typename AFloat>
 
   97 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(
size_t size, cudaStream_t stream)
 
   98    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
 
  100    AFloat **pointer = 
new AFloat *[1];
 
  101    cudaMalloc(pointer, size * 
sizeof(AFloat));
 
  102    fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
 
  106 template <
typename AFloat>
 
  107 TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, 
size_t size, cudaStream_t stream)
 
  108    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
 
  110    AFloat **pointer = 
new AFloat *[1];
 
  111    *pointer = devicePointer;
 
  112    fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
 
  // Builds a copy of this buffer (via `*this`) and sets the copy's fOffset to
  // `offset`.
  // NOTE(review): this listing is a lossy extraction. The embedded numbering
  // jumps from 120 to 126 and the braces are absent, so further statements
  // (presumably any use of `size` and the return of `buffer`) are not shown
  // here -- confirm against the original file.
  116 template <
typename AFloat>
 
  117 TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(
size_t offset, 
size_t size)
 
  119    TCudaDeviceBuffer buffer = *
this;
 
  120    buffer.fOffset = offset;
 
  126 template <
typename AFloat>
 
  127 TCudaDeviceBuffer<AFloat>::operator AFloat *() 
const 
  129    return *fDevicePointer + fOffset;
 
  133 template <
typename AFloat>
 
  134 void TCudaDeviceBuffer<AFloat>::CopyFrom(
const TCudaHostBuffer<AFloat> &buffer)
 const 
  136    cudaStreamSynchronize(fComputeStream);
 
  137    cudaMemcpyAsync(*
this, buffer, fSize * 
sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
 
  141 template <
typename AFloat>
 
  142 void TCudaDeviceBuffer<AFloat>::CopyTo(
const TCudaHostBuffer<AFloat> &buffer)
 const 
  144    cudaMemcpyAsync(buffer, *
this, fSize * 
sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
 
  145    buffer.fComputeStream = fComputeStream;
 
  // Copies rows of the matrix std::get<0>(fData) into `buffer`, converting each
  // entry to float. The value for loop index i and column j is stored at
  // j * batchSize + i, so the values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, the braces and the end of the parameter list are absent, and
  // no advance of sampleIterator is visible. Confirm against the original file.
  150 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
 
  153    const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
 
  154    size_t n = inputMatrix.GetNcols();
 
  156    for (
size_t i = 0; i < batchSize; i++) {
 
  157       size_t sampleIndex = *sampleIterator;
 
  158       for (
size_t j = 0; j < n; j++) {
 
  159          size_t bufferIndex = j * batchSize + i;
 
  160          buffer[bufferIndex] = 
static_cast<float>(inputMatrix(sampleIndex, j));
 
  // Copies rows of the matrix std::get<1>(fData) into `buffer`, converting each
  // entry to float. The value for loop index i and column j is stored at
  // j * batchSize + i, so the values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, braces are absent, and no advance of sampleIterator is
  // visible. Confirm against the original file.
  168 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
 
  169                                                           IndexIterator_t sampleIterator, 
size_t batchSize)
 
  171    const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
 
  172    size_t n = outputMatrix.GetNcols();
 
  174    for (
size_t i = 0; i < batchSize; i++) {
 
  175       size_t sampleIndex = *sampleIterator;
 
  176       for (
size_t j = 0; j < n; j++) {
 
  177          size_t bufferIndex = j * batchSize + i;
 
  178          buffer[bufferIndex] = 
static_cast<float>(outputMatrix(sampleIndex, j));
 
  // Writes one float per batch entry into `buffer`: column 0 of the matrix
  // std::get<2>(fData) at the row given by *sampleIterator.
  // NOTE(review): this listing is a lossy extraction -- braces are absent and
  // no advance of sampleIterator is visible. Confirm against the original file.
  186 void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
 
  187                                                            IndexIterator_t sampleIterator, 
size_t batchSize)
 
  189    const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
 
  190    for (
size_t i = 0; i < batchSize; i++) {
 
  191       buffer[i] = 
static_cast<float>(weightMatrix(*sampleIterator, 0));
 
  // Copies event values into `buffer` as float. For every batch entry the
  // iterator is read and advanced, the event std::get<0>(fData)[sampleIndex] is
  // looked up, and its value j is stored at j * batchSize + i.
  // The number of values per event is taken from element 0 of
  // std::get<0>(fData), which is read unconditionally.
  // NOTE(review): this listing is a lossy extraction -- braces and the end of
  // the parameter list are absent. Confirm against the original file.
  198 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
 
  201    Event *
event = std::get<0>(fData)[0];
 
  202    size_t n  = 
event->GetNVariables();
 
  203    for (
size_t i = 0; i < batchSize; i++) {
 
  204       size_t sampleIndex = * sampleIterator++;
 
  205       event = std::get<0>(fData)[sampleIndex];
 
  206       for (
size_t j = 0; j < n; j++) {
 
  207          size_t bufferIndex = j * batchSize + i;
 
  208          buffer[bufferIndex] = 
static_cast<float>(
event->GetValue(j));
 
  // Fills `buffer` with per-event output values as float; the number of outputs
  // per event is buffer.GetSize() / batchSize and value j of batch entry i is
  // stored at j * batchSize + i.
  // The visible statements write one of: a 1.0/0.0 flag from
  // info.IsSignal(event); 0.0 replaced by 1.0 where j equals event->GetClass();
  // or event->GetTarget(j). The first two sit under event->GetNTargets() == 0.
  // NOTE(review): this listing is a lossy extraction -- braces, the end of the
  // parameter list and some of the conditions selecting between these cases
  // are absent. Confirm against the original file.
  215 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
 
  218   const DataSetInfo &info = std::get<1>(fData);
 
  219   size_t n = buffer.GetSize() / batchSize;
 
  223   for (
size_t i = 0; i < batchSize; i++) {
 
  224     size_t sampleIndex = *sampleIterator++;
 
  225     Event *
event = std::get<0>(fData)[sampleIndex];
 
  226     for (
size_t j = 0; j < n; j++) {
 
  228       size_t bufferIndex = j * batchSize + i;
 
  230       if (event->GetNTargets() == 0) {
 
  233           buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
 
  236           buffer[bufferIndex] = 0.0;
 
  237           if (j == event->GetClass()) {
 
  238             buffer[bufferIndex] = 1.0;
 
  242         buffer[bufferIndex] = 
static_cast<float>(
event->GetTarget(j));
 
  // Writes one float per batch entry into `buffer`: the iterator is read and
  // advanced, the event std::get<0>(fData)[sampleIndex] is looked up, and its
  // GetWeight() result is stored at index i.
  // NOTE(review): this listing is a lossy extraction -- braces and the end of
  // the parameter list are absent. Confirm against the original file.
  250 void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
 
  253    for (
size_t i = 0; i < batchSize; i++) {
 
  254       size_t sampleIndex = *sampleIterator++;
 
  255       Event *
event = std::get<0>(fData)[sampleIndex];
 
  256       buffer[i] = 
static_cast<float>(
event->GetWeight());
 
  // Copies rows of the matrix std::get<0>(fData) into the double `buffer`. The
  // value for loop index i and column j is stored at j * batchSize + i, so the
  // values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, braces are absent, and no advance of sampleIterator is
  // visible. Confirm against the original file.
  262 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
 
  263                                                           IndexIterator_t sampleIterator, 
size_t batchSize)
 
  265    const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
 
  266    size_t n = inputMatrix.GetNcols();
 
  268    for (
size_t i = 0; i < batchSize; i++) {
 
  269       size_t sampleIndex = *sampleIterator;
 
  270       for (
size_t j = 0; j < n; j++) {
 
  271          size_t bufferIndex = j * batchSize + i;
 
  272          buffer[bufferIndex] = inputMatrix(sampleIndex, j);
 
  // Copies rows of the matrix std::get<1>(fData) into the double `buffer`. The
  // value for loop index i and column j is stored at j * batchSize + i, so the
  // values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, braces are absent, and no advance of sampleIterator is
  // visible. Confirm against the original file.
  280 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
 
  281                                                            IndexIterator_t sampleIterator, 
size_t batchSize)
 
  283    const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
 
  284    size_t n = outputMatrix.GetNcols();
 
  286    for (
size_t i = 0; i < batchSize; i++) {
 
  287       size_t sampleIndex = *sampleIterator;
 
  288       for (
size_t j = 0; j < n; j++) {
 
  289          size_t bufferIndex = j * batchSize + i;
 
  290          buffer[bufferIndex] = outputMatrix(sampleIndex, j);
 
  // Writes one double per batch entry into `buffer`: column 0 of the matrix
  // std::get<2>(fData) at the row given by *sampleIterator.
  // NOTE(review): this listing is a lossy extraction -- braces are absent and
  // no advance of sampleIterator is visible. Confirm against the original file.
  298 void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
 
  299                                                             IndexIterator_t sampleIterator, 
size_t batchSize)
 
  301    const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
 
  302    for (
size_t i = 0; i < batchSize; i++) {
 
  303       buffer[i] = 
static_cast<double>(weightMatrix(*sampleIterator, 0));
 
  // Copies event values into the double `buffer`. For every batch entry the
  // iterator is read and advanced, the event std::get<0>(fData)[sampleIndex] is
  // looked up, and its value j is stored at j * batchSize + i.
  // The number of values per event is taken from element 0 of
  // std::get<0>(fData), which is read unconditionally.
  // NOTE(review): this listing is a lossy extraction -- braces and the end of
  // the parameter list are absent. Confirm against the original file.
  310 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer, IndexIterator_t sampleIterator,
 
  313    Event *
event = std::get<0>(fData)[0];
 
  314    size_t n  = 
event->GetNVariables();
 
  315    for (
size_t i = 0; i < batchSize; i++) {
 
  316       size_t sampleIndex = * sampleIterator++;
 
  317       event = std::get<0>(fData)[sampleIndex];
 
  318       for (
size_t j = 0; j < n; j++) {
 
  319          size_t bufferIndex = j * batchSize + i;
 
  320          buffer[bufferIndex] = 
event->GetValue(j);
 
  // Fills the double `buffer` with per-event output values; the number of
  // outputs per event is buffer.GetSize() / batchSize and value j of batch
  // entry i is stored at j * batchSize + i.
  // The visible statements write one of: a 1.0/0.0 flag from
  // info.IsSignal(event); 0.0 replaced by 1.0 where j equals event->GetClass();
  // or event->GetTarget(j). The first two sit under event->GetNTargets() == 0.
  // NOTE(review): this listing is a lossy extraction -- braces and some of the
  // conditions selecting between these cases are absent. Confirm against the
  // original file.
  327 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
 
  328                                                          IndexIterator_t sampleIterator, 
size_t batchSize)
 
  330   const DataSetInfo &info = std::get<1>(fData);
 
  331   size_t n = buffer.GetSize() / batchSize;
 
  335   for (
size_t i = 0; i < batchSize; i++) {
 
  336     size_t sampleIndex = *sampleIterator++;
 
  337     Event *
event = std::get<0>(fData)[sampleIndex];
 
  338     for (
size_t j = 0; j < n; j++) {
 
  340       size_t bufferIndex = j * batchSize + i;
 
  342       if (event->GetNTargets() == 0) {
 
  345           buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
 
  348           buffer[bufferIndex] = 0.0;
 
  349           if (j == event->GetClass()) {
 
  350             buffer[bufferIndex] = 1.0;
 
  354         buffer[bufferIndex] = 
event->GetTarget(j);
 
  // Writes one double per batch entry into `buffer`: the iterator is read and
  // advanced, the event std::get<0>(fData)[sampleIndex] is looked up, and its
  // GetWeight() result is stored at index i.
  // NOTE(review): this listing is a lossy extraction -- braces are absent.
  // Confirm against the original file.
  362 void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
 
  363                                                           IndexIterator_t sampleIterator, 
size_t batchSize)
 
  365    for (
size_t i = 0; i < batchSize; i++) {
 
  366       size_t sampleIndex = *sampleIterator++;
 
  367       Event *
event = std::get<0>(fData)[sampleIndex];
 
  368       buffer[i] = 
static_cast<double>(
event->GetWeight());
 
  // Copies entries of the matrices in std::get<0>(fData) into `buffer` as float.
  // Under fBatchDepth == 1: entry (sampleIndex, j) of matrix 0 is stored at
  // j * fBatchHeight + i for i < fBatchHeight, j < fBatchWidth.
  // In the second loop nest: entry (j, k) of matrix sampleIndex is stored at
  // i * fBatchHeight * fBatchWidth + k * fBatchHeight + j for i < fBatchDepth.
  // NOTE(review): this listing is a lossy extraction -- braces, the branch
  // separating the two loop nests (presumably an else) and any advance of
  // sampleIterator are not visible. Confirm against the original file.
  374 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
 
  375                                                                    IndexIterator_t sampleIterator)
 
  377    const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
 
  379    if (fBatchDepth == 1) {
 
  380       for (
size_t i = 0; i < fBatchHeight; i++) {
 
  381          size_t sampleIndex = *sampleIterator;
 
  382          for (
size_t j = 0; j < fBatchWidth; j++) {
 
  383             size_t bufferIndex = j * fBatchHeight + i;
 
  384             buffer[bufferIndex] = 
static_cast<float>(inputTensor[0](sampleIndex, j));
 
  389       for (
size_t i = 0; i < fBatchDepth; i++) {
 
  390          size_t sampleIndex = *sampleIterator;
 
  391          for (
size_t j = 0; j < fBatchHeight; j++) {
 
  392             for (
size_t k = 0; k < fBatchWidth; k++) {
 
  393                size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
 
  394                buffer[bufferIndex] = 
static_cast<float>(inputTensor[sampleIndex](j, k));
 
  // Copies rows of the matrix std::get<1>(fData) into `buffer`, converting each
  // entry to float. The value for loop index i and column j is stored at
  // j * fBatchSize + i, so the values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, braces are absent, and no advance of sampleIterator is
  // visible. Confirm against the original file.
  404 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
 
  405                                                                     IndexIterator_t sampleIterator)
 
  407    const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
 
  408    size_t n = outputMatrix.GetNcols();
 
  410    for (
size_t i = 0; i < fBatchSize; i++) {
 
  411       size_t sampleIndex = *sampleIterator;
 
  412       for (
size_t j = 0; j < n; j++) {
 
  413          size_t bufferIndex = j * fBatchSize + i;
 
  414          buffer[bufferIndex] = 
static_cast<float>(outputMatrix(sampleIndex, j));
 
  // Writes one float per batch entry into `buffer`: column 0 of the matrix
  // std::get<2>(fData) at the row given by *sampleIterator.
  // NOTE(review): this listing is a lossy extraction -- braces are absent and
  // no advance of sampleIterator is visible. Confirm against the original file.
  422 void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
 
  423                                                                      IndexIterator_t sampleIterator)
 
  425    const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
 
  426    for (
size_t i = 0; i < fBatchSize; i++) {
 
  427       buffer[i] = 
static_cast<float>(weightMatrix(*sampleIterator, 0));
 
  // Copies event values from std::get<0>(fData) into the float `buffer`.
  // When fBatchDepth == 1 and fBatchHeight == fBatchSize: value j of the event
  // is stored at j * fBatchHeight + i.
  // When fBatchDepth == fBatchSize: value j * fBatchWidth + k of the event is
  // stored at i * fBatchHeight * fBatchWidth + k * fBatchHeight + j.
  // The trailing statements print fBatchDepth, fBatchSize and fBatchHeight
  // back to back (no separators) and report an error via Error(); presumably
  // they belong to a final else branch.
  // NOTE(review): this listing is a lossy extraction -- braces, the final
  // branch header and any advance of sampleIterator are not visible. Confirm
  // against the original file.
  434 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
 
  435                                                                    IndexIterator_t sampleIterator)
 
  439    if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
 
  440       for (
size_t i = 0; i < fBatchHeight; i++) {
 
  441          size_t sampleIndex = *sampleIterator;
 
  442          Event * 
event = std::get<0>(fData)[sampleIndex];
 
  443          for (
size_t j = 0; j < fBatchWidth; j++) {
 
  444             size_t bufferIndex = j * fBatchHeight + i;
 
  445             buffer[bufferIndex] = 
event->GetValue(j);
 
  449    } 
else if (fBatchDepth == fBatchSize) {
 
  451       for (
size_t i = 0; i < fBatchDepth; i++) {
 
  452          size_t sampleIndex = *sampleIterator;
 
  453          Event * 
event = std::get<0>(fData)[sampleIndex];
 
  454          for (
size_t j = 0; j < fBatchHeight; j++) {
 
  455             for (
size_t k = 0; k < fBatchWidth; k++) {
 
  457                size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
 
  458                buffer[bufferIndex] = 
event->GetValue(j * fBatchWidth + k);
 
  465       std::cout  << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
 
  466       Error(
"TTensorDataLoader",
"Inconsistency between batch depth and batch size");
 
  // Fills the float `buffer` with per-event output values; the number of
  // outputs per event is buffer.GetSize() / fBatchSize and value j of batch
  // entry i is stored at j * fBatchSize + i.
  // The visible statements write one of: a 1.0/0.0 flag from
  // info.IsSignal(event); 0.0 replaced by 1.0 where j equals event->GetClass();
  // or event->GetTarget(j). The first two sit under event->GetNTargets() == 0.
  // NOTE(review): this listing is a lossy extraction -- braces and some of the
  // conditions selecting between these cases are absent. Confirm against the
  // original file.
  472 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
 
  473                                                                     IndexIterator_t sampleIterator)
 
  475    const DataSetInfo &info = std::get<1>(fData);
 
  476    size_t n = buffer.GetSize() / fBatchSize;
 
  480    for (
size_t i = 0; i < fBatchSize; i++) {
 
  481       size_t sampleIndex = *sampleIterator++;
 
  482       Event *
event = std::get<0>(fData)[sampleIndex];
 
  483       for (
size_t j = 0; j < n; j++) {
 
  485          size_t bufferIndex = j * fBatchSize + i;
 
  487          if (event->GetNTargets() == 0) {
 
  490                buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
 
  493                buffer[bufferIndex] = 0.0;
 
  494                if (j == event->GetClass()) {
 
  495                   buffer[bufferIndex] = 1.0;
 
  499             buffer[bufferIndex] = 
static_cast<Float_t
>(
event->GetTarget(j));
 
  // Writes one value per batch entry into the float `buffer`: the iterator is
  // read and advanced, the event std::get<0>(fData)[sampleIndex] is looked up,
  // and its GetWeight() result is stored at index i (implicit conversion).
  // NOTE(review): this listing is a lossy extraction -- braces are absent.
  // Confirm against the original file.
  507 void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
 
  508                                                                      IndexIterator_t sampleIterator)
 
  510    for (
size_t i = 0; i < fBatchSize; i++) {
 
  511       size_t sampleIndex = *sampleIterator++;
 
  512       Event *
event = std::get<0>(fData)[sampleIndex];
 
  513       buffer[i] = 
event->GetWeight();
 
  // Copies entries of the matrices in std::get<0>(fData) into the double
  // `buffer`.
  // Under fBatchDepth == 1: entry (sampleIndex, j) of matrix 0 is stored at
  // j * fBatchHeight + i for i < fBatchHeight, j < fBatchWidth.
  // In the second loop nest: entry (j, k) of matrix sampleIndex is stored at
  // i * fBatchHeight * fBatchWidth + k * fBatchHeight + j for i < fBatchDepth.
  // NOTE(review): both assignments cast the Double_t matrix entry to float
  // before storing it in a double buffer, which discards precision; this looks
  // copied from the float specialization and is probably unintended -- confirm.
  // NOTE(review): this listing is a lossy extraction -- braces, the branch
  // separating the two loop nests (presumably an else) and any advance of
  // sampleIterator are not visible. Confirm against the original file.
  519 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
 
  520                                                                     IndexIterator_t sampleIterator)
 
  522    const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
 
  524    if (fBatchDepth == 1) {
 
  525       for (
size_t i = 0; i < fBatchHeight; i++) {
 
  526          size_t sampleIndex = *sampleIterator;
 
  527          for (
size_t j = 0; j < fBatchWidth; j++) {
 
  528             size_t bufferIndex = j * fBatchHeight + i;
 
  529             buffer[bufferIndex] = 
static_cast<float>(inputTensor[0](sampleIndex, j));
 
  534       for (
size_t i = 0; i < fBatchDepth; i++) {
 
  535          size_t sampleIndex = *sampleIterator;
 
  536          for (
size_t j = 0; j < fBatchHeight; j++) {
 
  537             for (
size_t k = 0; k < fBatchWidth; k++) {
 
  538                size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
 
  539                buffer[bufferIndex] = 
static_cast<float>(inputTensor[sampleIndex](j, k));
 
  // Copies rows of the matrix std::get<1>(fData) into the double `buffer`. The
  // value for loop index i and column j is stored at j * fBatchSize + i, so the
  // values of one column are contiguous.
  // NOTE(review): this listing is a lossy extraction -- the embedded numbering
  // skips lines, braces are absent, and no advance of sampleIterator is
  // visible. Confirm against the original file.
  549 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
 
  550                                                                      IndexIterator_t sampleIterator)
 
  552    const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
 
  553    size_t n = outputMatrix.GetNcols();
 
  555    for (
size_t i = 0; i < fBatchSize; i++) {
 
  556       size_t sampleIndex = *sampleIterator;
 
  557       for (
size_t j = 0; j < n; j++) {
 
  558          size_t bufferIndex = j * fBatchSize + i;
 
  559          buffer[bufferIndex] = outputMatrix(sampleIndex, j);
 
  // Writes one double per batch entry into `buffer`: column 0 of the matrix
  // std::get<2>(fData) at the row given by *sampleIterator.
  // NOTE(review): this listing is a lossy extraction -- braces are absent and
  // no advance of sampleIterator is visible. Confirm against the original file.
  567 void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
 
  568                                                                       IndexIterator_t sampleIterator)
 
  570    const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
 
  572    for (
size_t i = 0; i < fBatchSize; i++) {
 
  573       buffer[i] = weightMatrix(*sampleIterator, 0);
 
  // Copies event values from std::get<0>(fData) into the double `buffer`.
  // When fBatchDepth == 1 and fBatchHeight == fBatchSize: value j of the event
  // is stored at j * fBatchHeight + i.
  // When fBatchDepth == fBatchSize: value j * fBatchWidth + k of the event is
  // stored at i * fBatchHeight * fBatchWidth + k * fBatchHeight + j.
  // The trailing statements print fBatchDepth, fBatchSize and fBatchHeight
  // back to back (no separators) and report an error via Error(); presumably
  // they belong to a final else branch.
  // NOTE(review): this listing is a lossy extraction -- braces, the final
  // branch header and any advance of sampleIterator are not visible. Confirm
  // against the original file.
  580 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
 
  581                                                                     IndexIterator_t sampleIterator)
 
  585    if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
 
  586       for (
size_t i = 0; i < fBatchHeight; i++) {
 
  587          size_t sampleIndex = *sampleIterator;
 
  588          Event * 
event = std::get<0>(fData)[sampleIndex];
 
  589          for (
size_t j = 0; j < fBatchWidth; j++) {
 
  590             size_t bufferIndex = j * fBatchHeight + i;
 
  591             buffer[bufferIndex] = 
event->GetValue(j);
 
  595    } 
else if (fBatchDepth == fBatchSize) {
 
  597       for (
size_t i = 0; i < fBatchDepth; i++) {
 
  598          size_t sampleIndex = *sampleIterator;
 
  599          Event * 
event = std::get<0>(fData)[sampleIndex];
 
  600          for (
size_t j = 0; j < fBatchHeight; j++) {
 
  601             for (
size_t k = 0; k < fBatchWidth; k++) {
 
  603                size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
 
  604                buffer[bufferIndex] = 
event->GetValue(j * fBatchWidth + k);
 
  611       std::cout  << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
 
  612       Error(
"TTensorDataLoader",
"Inconsistency between batch depth and batch size");
 
  // Fills the double `buffer` with per-event output values; the number of
  // outputs per event is buffer.GetSize() / fBatchSize and value j of batch
  // entry i is stored at j * fBatchSize + i.
  // The visible statements write one of: a 1.0/0.0 flag from
  // info.IsSignal(event); 0.0 replaced by 1.0 where j equals event->GetClass();
  // or event->GetTarget(j). The first two sit under event->GetNTargets() == 0.
  // NOTE(review): this listing is a lossy extraction -- braces and some of the
  // conditions selecting between these cases are absent. Confirm against the
  // original file.
  619 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
 
  620                                                                      IndexIterator_t sampleIterator)
 
  622    const DataSetInfo &info = std::get<1>(fData);
 
  623    size_t n = buffer.GetSize() / fBatchSize;
 
  627    for (
size_t i = 0; i < fBatchSize; i++) {
 
  628       size_t sampleIndex = *sampleIterator++;
 
  629       Event *
event = std::get<0>(fData)[sampleIndex];
 
  630       for (
size_t j = 0; j < n; j++) {
 
  632          size_t bufferIndex = j * fBatchSize + i;
 
  634          if (event->GetNTargets() == 0) {
 
  637                buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
 
  640                buffer[bufferIndex] = 0.0;
 
  641                if (j == event->GetClass()) {
 
  642                   buffer[bufferIndex] = 1.0;
 
  646             buffer[bufferIndex] = 
static_cast<Double_t
>(
event->GetTarget(j));
 
  // Writes one value per batch entry into the double `buffer`: the iterator is
  // read and advanced, the event std::get<0>(fData)[sampleIndex] is looked up,
  // and its GetWeight() result is stored at index i.
  // NOTE(review): this listing is a lossy extraction -- braces are absent.
  // Confirm against the original file.
  654 void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
 
  655                                                                       IndexIterator_t sampleIterator)
 
  657    for (
size_t i = 0; i < fBatchSize; i++) {
 
  658       size_t sampleIndex = *sampleIterator++;
 
  659       Event *
event = std::get<0>(fData)[sampleIndex];
 
  660       buffer[i] = 
event->GetWeight();
 
  // Builds a TTensorBatch<TCuda<float>> from the buffers returned by
  // CopyTensorBatches(): one Matrix_t of fBatchHeight x fBatchWidth per batch
  // entry, each wrapping a sub-buffer of buffer 0 at offset
  // i * fBatchHeight * fBatchWidth, plus an output and a weight matrix that
  // wrap buffers 1 and 2, both sized fBatchSize x fNOutputFeatures.
  // NOTE(review): the inputTensor declaration below is malformed as shown (an
  // empty trailing argument and no terminating semicolon); the sibling
  // specializations declare a default-constructed vector instead -- confirm
  // against the original file.
  // NOTE(review): this listing is a lossy extraction -- braces and other lines
  // (the embedded numbering skips) are absent.
  667 TTensorBatch<TCuda<float> > TTensorDataLoader<TensorInput, TCuda<float> >::GetTensorBatch()
 
  671    DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
 
  673    std::vector<Matrix_t> inputTensor(std::get<0>(DeviceBuffers), fBatchSize, )
 
  674    size_t jump = fBatchHeight * fBatchWidth;
 
  675    for (
size_t i = 0; i < fBatchSize; i++) {
 
  676       DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
 
  677       inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
 
  679    Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  680    Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  683    return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
 
  // Builds a TTensorBatch<TCuda<double>> from the buffers returned by
  // CopyTensorBatches(): one Matrix_t of fBatchHeight x fBatchWidth per batch
  // entry, each wrapping a sub-buffer of buffer 0 at offset
  // i * fBatchHeight * fBatchWidth, plus an output and a weight matrix that
  // wrap buffers 1 and 2, both sized fBatchSize x fNOutputFeatures.
  // NOTE(review): this listing is a lossy extraction -- braces and other lines
  // (the embedded numbering skips) are absent. Confirm against the original.
  688 TTensorBatch<TCuda<double> > TTensorDataLoader<TensorInput, TCuda<double> >::GetTensorBatch()
 
  692    DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
 
  694    std::vector<Matrix_t> inputTensor;
 
  695    size_t jump = fBatchHeight * fBatchWidth;
 
  696    for (
size_t i = 0; i < fBatchSize; i++) {
 
  697       DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
 
  698       inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
 
  700    Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  701    Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  704    return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
 
  // Builds a TTensorBatch<TCuda<float>> from the buffers returned by
  // CopyTensorBatches(): one Matrix_t of fBatchHeight x fBatchWidth per batch
  // entry, each wrapping a sub-buffer of buffer 0 at offset
  // i * fBatchHeight * fBatchWidth, plus an output and a weight matrix that
  // wrap buffers 1 and 2, both sized fBatchSize x fNOutputFeatures.
  // NOTE(review): this listing is a lossy extraction -- braces and other lines
  // (the embedded numbering skips) are absent. Confirm against the original.
  709 TTensorBatch<TCuda<float> > TTensorDataLoader<TMVAInput_t, TCuda<float> >::GetTensorBatch()
 
  713    DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
 
  715    std::vector<Matrix_t> inputTensor;
 
  716    size_t jump = fBatchHeight * fBatchWidth;
 
  717    for (
size_t i = 0; i < fBatchSize; i++) {
 
  718       DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
 
  719       inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
 
  721    Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  722    Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  725    return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
 
  // Builds a TTensorBatch<TCuda<double>> from the buffers returned by
  // CopyTensorBatches(): one Matrix_t of fBatchHeight x fBatchWidth per batch
  // entry, each wrapping a sub-buffer of buffer 0 at offset
  // i * fBatchHeight * fBatchWidth, plus an output and a weight matrix that
  // wrap buffers 1 and 2, both sized fBatchSize x fNOutputFeatures.
  // NOTE(review): this listing is a lossy extraction -- braces and other lines
  // (the embedded numbering skips) are absent. Confirm against the original.
  730 TTensorBatch<TCuda<double> > TTensorDataLoader<TMVAInput_t, TCuda<double> >::GetTensorBatch()
 
  734    DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
 
  736    std::vector<Matrix_t> inputTensor;
 
  737    size_t jump = fBatchHeight * fBatchWidth;
 
  738    for (
size_t i = 0; i < fBatchSize; i++) {
 
  739       DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
 
  740       inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
 
  742    Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  743    Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
 
  746    return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
 
  // Explicit instantiations: the device and host buffer templates for float and
  // double, and the TDataLoader / TTensorDataLoader templates for each visible
  // input type combined with TCuda<float> and TCuda<double>.
  755 template class TCudaDeviceBuffer<float>;
 
  756 template class TCudaDeviceBuffer<double>;
 
  758 template class TCudaHostBuffer<float>;
 
  759 template class TCudaHostBuffer<double>;
 
  761 template class TDataLoader<MatrixInput_t, TCuda<float>>;
 
  762 template class TDataLoader<TMVAInput_t, TCuda<float>>;
 
  763 template class TDataLoader<MatrixInput_t, TCuda<double>>;
 
  764 template class TDataLoader<TMVAInput_t, TCuda<double>>;
 
  766 template class TTensorDataLoader<TensorInput, TCuda<float> >;
 
  767 template class TTensorDataLoader<TMVAInput_t, TCuda<float> >;
 
  768 template class TTensorDataLoader<TensorInput, TCuda<double >>;
 
  769 template class TTensorDataLoader<TMVAInput_t, TCuda<double> >;