17 #ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX 
   18 #define TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX 
   21 #define DL_USE_MTE // use MT with tbb 
   // Debug printing helper TMVA_DNN_PrintTCpuMatrix(mat, text).
   // With DEBUG_TMVA_TCPUMATRIX defined, the visible part of the macro writes to std::cout:
   //   - "<mat> is null pointer" when mat.GetRawDataPointer() compares equal to NULL;
   //   - a header line made of `text`, the stringified argument `mat` and its (rows,cols) size;
   //   - the elements mat(_i, _j), with a line break after each row.
   // The last definition below expands to nothing; presumably it is the fallback used when
   // DEBUG_TMVA_TCPUMATRIX is not defined.
   // NOTE(review): several macro lines and the #else/#endif are not visible in this excerpt -
   // confirm the exact control flow against the full header.
   33 #if defined(DEBUG_TMVA_TCPUMATRIX) 
   46 #define TMVA_DNN_PrintTCpuMatrix(mat, text)                                                                \ 
   48       auto _dpointer = mat.GetRawDataPointer();                                                            \ 
   49       if (_dpointer == NULL) {                                                                             \ 
   50          std::cout << #mat << " is null pointer" << std::endl;                                             \ 
   53       auto _nrows = mat.GetNrows();                                                                        \ 
   54       auto _ncols = mat.GetNcols();                                                                        \ 
   55       std::cout << "---------------------" << text << " " << #mat << "(" << _nrows << "," << _ncols << ")" \ 
   56                 << "--------------------" << std::endl;                                                    \ 
   57       for (size_t _i = 0; _i < _nrows; _i++) {                                                             \ 
   58          for (size_t _j = 0; _j < _ncols; _j++) {                                                          \ 
   59             std::cout << mat(_i, _j);                                                                      \ 
   60             if (_j < _ncols - 1)                                                                           \ 
   63          std::cout << std::endl;                                                                           \ 
   67 #define TMVA_DNN_PrintTCpuMatrix(mat, text) 
   86 template <
typename AFloat>
 
   89    static std::vector<AFloat> fOnes; 
 
   92    TCpuBuffer<AFloat> fBuffer; 
 
  105    TCpuBuffer<AFloat>& GetBuffer() {
return fBuffer;}
 
  106    const TCpuBuffer<AFloat>& GetBuffer()
 const {
return fBuffer;}
 
  109    static const AFloat *GetOnePointer() { 
return fOnes.data(); }
 
  111    static size_t GetOnePointerSize() { 
return fOnes.size(); }
 
  /// Static initialiser for the shared fOnes vector (declaration only; the definition is
  /// not in this part of the file).
  /// NOTE(review): presumably makes fOnes hold at least `n` ones - confirm.
  static void InitializeOneVector(size_t n);
 
  115    TCpuMatrix() : fNCols(0), fNRows(0) {}
 
  118    TCpuMatrix(
size_t nRows, 
size_t nCols);
 
  121    TCpuMatrix(
const TMatrixT<AFloat> &);
 
  124    TCpuMatrix(
const TCpuBuffer<AFloat> &buffer, 
size_t m, 
size_t n);
 
  127    TCpuMatrix(
const TCpuMatrix &) = 
default;
 
  128    TCpuMatrix(TCpuMatrix &&) = 
default;
 
  129    TCpuMatrix &operator=(
const TCpuMatrix &) = 
default;
 
  130    TCpuMatrix &operator=(TCpuMatrix &&) = 
default;
 
  131    ~TCpuMatrix() = 
default;
 
  139    operator TMatrixT<AFloat>() 
const;
 
  /// Replace every element x of this matrix by f(x).
  /// @param f callable taken by non-const lvalue reference and applied to each element.
  template <typename Function_t>
  void Map(Function_t &f);
 
  148    template <
typename Function_t>
 
  149    void MapFrom(Function_t &f, 
const TCpuMatrix &A);
 
  151    size_t GetNrows()
 const { 
return fNRows; }
 
  152    size_t GetNcols()
 const { 
return fNCols; }
 
  153    size_t GetNoElements()
 const { 
return fNRows * fNCols; }
 
  154    size_t GetSize()
 const { 
return fNRows * fNCols; }
 
  157    AFloat operator()(
size_t i, 
size_t j)
 const { 
return fBuffer[j * fNRows + i]; }
 
  158    AFloat &operator()(
size_t i, 
size_t j) { 
return fBuffer[j * fNRows + i]; }
 
  162    AFloat *GetRawDataPointer() { 
return fBuffer; }
 
  163    const AFloat *GetRawDataPointer()
 const { 
return fBuffer; }
 
  165    static Executor &GetThreadExecutor() { 
return TMVA::Config::Instance().GetThreadExecutor(); }
 
  /// Chunk size used when `nelements` elements are processed in parallel: Map() and
  /// MapFrom() hand this many consecutive elements to each work item.
  static size_t GetNWorkItems(size_t nelements);
 
  173       TCpuMatrix cpuMatrix = *
this;
 
  174       TMVA_DNN_PrintTCpuMatrix(cpuMatrix, 
"CpuMatrix");
 
  181 template <
typename AFloat>
 
  182 std::vector<AFloat> TCpuMatrix<AFloat>::fOnes{};
 
  // Computes the chunk size (number of consecutive elements per work item) used when
  // `nElements` elements are processed in parallel; Map() and MapFrom() use the result
  // as the stride of the index sequence they hand to the thread executor.
  // NOTE(review): some lines of this body (braces and the statement guarded by the
  // first `if`) are not visible in this excerpt; only visible statements are described.
  186 template <
typename AFloat>
 
  187 size_t TCpuMatrix<AFloat>::GetNWorkItems(
size_t nElements)
 
  // Threshold, in elements, that the two size checks below compare against.
  192    const size_t minElements = 1000;
 
  // Number of CPUs as reported by the global TMVA configuration.
  193    const size_t nCpu = TMVA::Config::Instance().GetNCpu();
 
  // Inputs of at most minElements elements are special-cased.
  194    if (nElements <= minElements)
 
  // Fewer than minElements elements per CPU: use nt chunks, nt being the number of
  // whole minElements-sized pieces that fit into nElements.
  196    if (nElements < nCpu * minElements) {
 
  197       size_t nt = nElements / minElements;
 
  198       return nElements / nt;
 
  // Otherwise divide the elements evenly between the CPUs.
  // NOTE(review): divides by nCpu - assumes GetNCpu() never returns 0; confirm.
  200    return nElements / nCpu;
 
  // Applies `f` to every element in place: data[j] = f(data[j]).
  // The raw element array is processed in chunks of GetNWorkItems(nelements) elements;
  // when the chunk size is smaller than the element count, the chunks are dispatched
  // through the thread executor's Foreach.
  // NOTE(review): parts of this body (closing braces, the end of the lambda and the
  // branch containing the final assertion) are not visible in this excerpt.
  206 template <
typename AFloat>
 
  207 template <
typename Function_t>
 
  208 inline void TCpuMatrix<AFloat>::Map(Function_t &f)
 
  210    AFloat *data = GetRawDataPointer();
 
  211    size_t nelements = GetNoElements();
 
  // Chunk size: number of consecutive elements handled by one call of ff.
  212    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
 
  // Worker for one chunk. `workerID` is used as the index of the chunk's first element.
  214    auto ff = [data, &nsteps, &nelements, &f](UInt_t workerID) {
 
  // Clamp the chunk end so the last chunk does not run past the data.
  215       size_t jMax = std::min(workerID + nsteps, nelements);
 
  216       for (
size_t j = workerID; j < jMax; ++j) {
 
  217          data[j] = f(data[j]);
 
  // More than one chunk: run ff over the start indices 0, nsteps, 2*nsteps, ...
  222    if (nsteps < nelements) {
 
  223       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
 
  // Checks that a single chunk covers all elements.
  // NOTE(review): presumably this sits in the else branch of the test above - confirm.
  229       R__ASSERT(nelements == nsteps);
 
  // Fills this matrix from `A` element by element: dataB[j] = f(dataA[j]), where dataB
  // is this matrix's raw data and dataA is A's raw data.
  // Both matrices must have the same number of elements (asserted below). The work is
  // split into chunks of GetNWorkItems(nelements) elements and, when more than one chunk
  // is needed, dispatched through the thread executor's Foreach.
  // NOTE(review): parts of this body (closing braces, the end of the lambda and the
  // branch containing the final assertion) are not visible in this excerpt.
  235 template <
typename AFloat>
 
  236 template <
typename Function_t>
 
  237 inline void TCpuMatrix<AFloat>::MapFrom(Function_t &f, 
const TCpuMatrix &A)
 
  // Destination: this matrix's element data.
  239    AFloat *dataB = GetRawDataPointer();
 
  // Source: read-only element data of A.
  240    const AFloat *dataA = A.GetRawDataPointer();
 
  242    size_t nelements = GetNoElements();
 
  // Only the element counts are compared here, not the individual dimensions.
  243    R__ASSERT(nelements == A.GetNoElements());
 
  // Chunk size: number of consecutive elements handled by one call of ff.
  244    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
 
  // Worker for one chunk. `workerID` is used as the index of the chunk's first element.
  // Everything, including the two data pointers, is captured by reference.
  246    auto ff = [&dataB, &dataA, &nsteps, &nelements, &f](UInt_t workerID) {
 
  // Clamp the chunk end so the last chunk does not run past the data.
  247       size_t jMax = std::min(workerID + nsteps, nelements);
 
  248       for (
size_t j = workerID; j < jMax; ++j) {
 
  249          dataB[j] = f(dataA[j]);
 
  // More than one chunk: run ff over the start indices 0, nsteps, 2*nsteps, ...
  253    if (nsteps < nelements) {
 
  254       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
 
  // Checks that a single chunk covers all elements.
  // NOTE(review): presumably this sits in the else branch of the test above - confirm.
  259       R__ASSERT(nelements == nsteps);
 
  264 template <
typename AFloat>
 
  265 void TCpuMatrix<AFloat>::Zero()
 
  267    for (
size_t j = 0; j < fNCols; j++) {
 
  268       for (
size_t i = 0; i < fNRows; i++) {