17 #ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUTENSOR 
   18 #define TMVA_DNN_ARCHITECTURES_CPU_CPUTENSOR 
   // Tensor class template parameterised on the element type AFloat. It derives
   // publicly from TMVA::Experimental::RTensor, using TCpuBuffer<AFloat> as the
   // element container.
   40 template <
typename AFloat>
 
   41 class TCpuTensor : 
public TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>> {
 
   // NOTE(review): no access specifier is visible between the class head and the
   // members in this listing - confirm that the original header declares them public.
   // TCpuMatrix<AFloat> is granted access to the non-public members of this class.
   46    friend class TCpuMatrix<AFloat>;
 
   // Shape container type, taken from the RTensor<AFloat> base template.
   48    using Shape_t = 
typename TMVA::Experimental::RTensor<AFloat>::Shape_t;
 
   // Short name for the RowMajor / ColumnMajor layout enumeration.
   49    using MemoryLayout = TMVA::Experimental::MemoryLayout;
 
   // Matrix type associated with this tensor type.
   50    using Matrix_t = TCpuMatrix<AFloat>;
 
   // Element (scalar) type.
   51    using Scalar_t = AFloat;
 
   54    TCpuTensor(): TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(0), {0})
 
   58    TCpuTensor(
size_t n, 
size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
   59       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(n * m), {n, m}, memlayout)
 
   63    TCpuTensor(
size_t bsize, 
size_t depth, 
size_t hw, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
   64       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(bsize * depth * hw), {depth, hw, bsize}, memlayout)
 
   66       if (memlayout == MemoryLayout::RowMajor)
 
   67          this->ReshapeInplace({bsize, depth, hw});
 
   71    TCpuTensor(
size_t bsize, 
size_t depth, 
size_t height, 
size_t width,
 
   72               MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
   73       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(bsize * depth * height * width),
 
   74       {depth, height, width, bsize}, memlayout)
 
   76       if (memlayout == MemoryLayout::RowMajor)
 
   77          this->ReshapeInplace({bsize, depth, height, width});
 
   81    TCpuTensor(Shape_t shape, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
   82       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(TMVA::Experimental::Internal::GetSizeFromShape(shape)),
 
   88    TCpuTensor(AFloat *data, 
const Shape_t &shape,
 
   89               MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
   90       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(TMVA::Experimental::Internal::GetSizeFromShape(shape)), shape, memlayout)
 
   92       auto& container = *(this->GetContainer());
 
   93       for (
size_t i = 0; i <  this->GetSize(); ++i) container[i] = data[i];
 
  100    TCpuTensor(
const TCpuBuffer<AFloat>& buffer, Shape_t shape, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
  101       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(buffer), shape, memlayout) {
 
  102          R__ASSERT(this->GetSize() <= this->GetContainer()->GetSize());
 
  109    TCpuTensor(
const TCpuMatrix<AFloat> &matrix, 
size_t dim = 3, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
 
  110       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(matrix.GetBuffer()),{matrix.GetNrows(), matrix.GetNcols()}, memlayout)
 
  114          Shape_t shape = this->GetShape();
 
  116          if (this->GetLayout() == MemoryLayout::ColumnMajor) {
 
  117             shape.insert(shape.end(),dim-2, 1);
 
  119             shape.insert(shape.begin(), dim - 2, 1);
 
  121          this->ReshapeInplace(shape);
 
  129    operator TMatrixT<AFloat>() 
const {
 
  131       if (this->GetShape().size() == 2 || (this->GetShape().size() == 3 && GetFirstSize() == 1)) {
 
  132          TCpuMatrix<AFloat> temp = GetMatrix();
 
  136       return TMatrixT<AFloat>(1, this->GetSize(), this->GetData());
 
  142    AFloat *GetRawDataPointer() { 
return *(this->GetContainer()); }
 
  143    const AFloat *GetRawDataPointer()
 const { 
return *(this->GetContainer()); }
 
  146    const TCpuBuffer<AFloat> & GetDeviceBuffer()
     const {
return *(this->GetContainer());}
 
  147    TCpuBuffer<AFloat>       & GetDeviceBuffer()           {
return *(this->GetContainer());}
 
  150    size_t GetNoElements()
 const { 
return this->GetSize(); }
 
  156    size_t GetFirstSize()
 const 
  158       auto& shape = this->GetShape();
 
  159       return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape.back() : shape.front();
 
  162    size_t GetCSize()
 const 
  164       auto& shape = this->GetShape();
 
  165       if (shape.size() == 2)  
return 1;
 
  166       return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape.front() : shape[1]; 
 
  169    size_t GetHSize()
 const 
  171       auto& shape = this->GetShape();
 
  172       if (shape.size() == 2)  
return shape[0];
 
  173       if (shape.size() == 3)  
return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape[0] : shape[1] ;
 
  174       if (shape.size() >= 4)  
return shape[2] ;
 
  178    size_t GetWSize()
 const 
  180       auto& shape = this->GetShape();
 
  181       if (shape.size() == 2)  
return shape[1];
 
  182       if (shape.size() == 3)  
return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape[1] : shape[2] ;
 
  183       if (shape.size() >= 4)  
return shape[3] ;
 
  191    size_t GetNrows()
 const { 
return (GetLayout() == MemoryLayout::ColumnMajor ) ? this->GetStrides().back() : this->GetShape().front();}
 
  192    size_t GetNcols()
 const { 
return (GetLayout() == MemoryLayout::ColumnMajor ) ? this->GetShape().back() : this->GetStrides().front(); }
 
  195    MemoryLayout GetLayout()
 const { 
return this->GetMemoryLayout(); }
 
  // Returns a TCpuMatrix built from the tensor's container, with GetHSize() and
  // GetWSize() passed as the two size arguments.
  // NOTE(review): the declaration of `ndims` and the body of the loop below are
  // not visible in this listing - confirm against the original header how `ndims`
  // is computed from the shape entries.
  198    TCpuMatrix<AFloat> GetMatrix()
 const 
  201       auto& shape = this->GetShape();
 
  // Visits every shape entry.
  203       for (
auto& shape_i : shape){
 
  // Requires ndims to be at most 2 and the shape to have more than one entry.
  208       assert(ndims <= 2 && shape.size() > 1);  
 
  209       return TCpuMatrix<AFloat>(*(this->GetContainer()), GetHSize(), GetWSize());
 
  213    TCpuTensor<AFloat> Reshape(Shape_t shape)
 const 
  215       TCpuTensor<AFloat> x(*
this);
 
  216       x.ReshapeInplace(shape);
 
  222       TCpuTensor<AFloat> At(
size_t i)
 
  224          auto &shape = this->GetShape();
 
  225          auto layout = this->GetMemoryLayout();
 
  226          Shape_t sliced_shape = (layout == MemoryLayout::RowMajor) ? Shape_t(shape.begin() + 1, shape.end())
 
  227                                                                    : Shape_t(shape.begin(), shape.end() - 1);
 
  229          size_t buffsize = (layout == MemoryLayout::RowMajor) ? this->GetStrides().front() : this->GetStrides().back();
 
  230          size_t offset = i * buffsize;
 
  232          return TCpuTensor<AFloat>(this->GetContainer()->GetSubBuffer(offset, buffsize), sliced_shape, layout);
 
  235       TCpuTensor<AFloat> At(
size_t i)
 const { 
return (
const_cast<TCpuTensor<AFloat> &
>(*
this)).At(i); }
 
  // NOTE(review): the signature of the member function these statements belong to
  // and the body of the loop below are not visible in this listing - confirm both
  // against the original header.
  // Raw element pointer obtained by converting the tensor's container.
  240          AFloat *data = *(this->GetContainer());
 
  // Iterates over all GetSize() element indices.
  241          for (
size_t i = 0; i < this->GetSize(); ++i)
 
  246       AFloat &operator()(
size_t i, 
size_t j)
 
  248          auto &shape = this->GetShape();
 
  249          assert(shape.size() == 2);
 
  250          return (this->GetMemoryLayout() == MemoryLayout::RowMajor) ? (*(this->GetContainer()))[i * shape[1] + j]
 
  251                                                                     : (*(this->GetContainer()))[j * shape[0] + i];
 
  256       AFloat &operator()(
size_t i, 
size_t j, 
size_t k)
 
  258          auto &shape = this->GetShape();
 
  259          assert(shape.size() == 3);
 
  261          return (this->GetMemoryLayout() == MemoryLayout::RowMajor)
 
  262                    ? (*(this->GetContainer()))[i * shape[1] * shape[2] + j * shape[2] + k]
 
  263                    : (*(this->GetContainer()))[i * shape[0] * shape[1] + k * shape[0] + j]; 
 
  267       AFloat operator()(
size_t i, 
size_t j)
 const 
  269          auto &shape = this->GetShape();
 
  270          assert(shape.size() == 2);
 
  271          return (this->GetMemoryLayout() == MemoryLayout::RowMajor) ? (this->GetData())[i * shape[1] + j]
 
  272                                                                     : (this->GetData())[j * shape[0] + i];
 
  275       AFloat operator()(
size_t i, 
size_t j, 
size_t k)
 const 
  277          auto &shape = this->GetShape();
 
  278          assert(shape.size() == 3);
 
  280          return (this->GetMemoryLayout() == MemoryLayout::RowMajor)
 
  281                    ? (this->GetData())[i * shape[1] * shape[2] + j * shape[2] + k]
 
  282                    : (this->GetData())[i * shape[0] * shape[1] + k * shape[0] + j]; 
 
  /** Replace every element x of the tensor by f(x); defined outside the class body. */
  template <typename Function_t>
  void Map(Function_t &f);
 
  292       template <
typename Function_t>
 
  293       void MapFrom(Function_t & f, 
const TCpuTensor<AFloat> &A);
 
  295       size_t GetBufferUseCount()
 const { 
return this->GetContainer()->GetUseCount(); }
 
  // Writes every element of the tensor, in flat index order, to std::cout with
  // two spaces after each value, then ends the line.
  // NOTE(review): `name` (default "Tensor") is not referenced in the lines visible
  // here and the listing skips lines at this point - confirm against the original
  // header whether it is used before the loop.
  297       void Print(
const char *name = 
"Tensor")
 const 
  // Walk the flat element array exposed by GetData().
  301          for (
size_t i = 0; i < this->GetSize(); i++)
 
  302             std::cout << (this->GetData())[i] << 
"  ";
 
  // End the line once all values have been written.
  303          std::cout << std::endl;
 
  305       void PrintShape(
const char *name = 
"Tensor")
 const 
  307          std::string memlayout = (GetLayout() == MemoryLayout::RowMajor) ? 
"RowMajor" : 
"ColMajor";
 
  308          std::cout << name << 
" shape : { ";
 
  309          auto &shape = this->GetShape();
 
  310          for (
size_t i = 0; i < shape.size() - 1; ++i)
 
  311             std::cout << shape[i] << 
" , ";
 
  312          std::cout << shape.back() << 
" } " 
  313                    << 
" Layout : " << memlayout << std::endl;
 
  318 template <
typename AFloat>
 
  319 template <
typename Function_t>
 
  320 inline void TCpuTensor<AFloat>::Map(Function_t &f)
 
  322    AFloat *data = GetRawDataPointer();
 
  323    size_t nelements = GetNoElements();
 
  324    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
 
  326    auto ff = [data, &nsteps, &nelements, &f](UInt_t workerID) {
 
  327       size_t jMax = std::min(workerID + nsteps, nelements);
 
  328       for (
size_t j = workerID; j < jMax; ++j) {
 
  329          data[j] = f(data[j]);
 
  334    if (nsteps < nelements) {
 
  335       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
 
  341       R__ASSERT(nelements == nsteps);
 
  347 template <
typename AFloat>
 
  348 template <
typename Function_t>
 
  349 inline void TCpuTensor<AFloat>::MapFrom(Function_t &f, 
const TCpuTensor<AFloat> &A)
 
  351    AFloat *dataB = GetRawDataPointer();
 
  352    const AFloat *dataA = A.GetRawDataPointer();
 
  354    size_t nelements = GetNoElements();
 
  355    R__ASSERT(nelements == A.GetNoElements());
 
  356    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
 
  358    auto ff = [&dataB, &dataA, &nsteps, &nelements, &f](UInt_t workerID) {
 
  359       size_t jMax = std::min(workerID + nsteps, nelements);
 
  360       for (
size_t j = workerID; j < jMax; ++j) {
 
  361          dataB[j] = f(dataA[j]);
 
  365    if (nsteps < nelements) {
 
  366       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
 
  371       R__ASSERT(nelements == nsteps);