17 #ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
18 #define TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
21 #define DL_USE_MTE // use MT with tbb
33 #if defined(DEBUG_TMVA_TCPUMATRIX)
46 #define TMVA_DNN_PrintTCpuMatrix(mat, text) \
48 auto _dpointer = mat.GetRawDataPointer(); \
49 if (_dpointer == NULL) { \
50 std::cout << #mat << " is null pointer" << std::endl; \
53 auto _nrows = mat.GetNrows(); \
54 auto _ncols = mat.GetNcols(); \
55 std::cout << "---------------------" << text << " " << #mat << "(" << _nrows << "," << _ncols << ")" \
56 << "--------------------" << std::endl; \
57 for (size_t _i = 0; _i < _nrows; _i++) { \
58 for (size_t _j = 0; _j < _ncols; _j++) { \
59 std::cout << mat(_i, _j); \
60 if (_j < _ncols - 1) \
63 std::cout << std::endl; \
67 #define TMVA_DNN_PrintTCpuMatrix(mat, text)
86 template <
typename AFloat>
89 static std::vector<AFloat> fOnes;
92 TCpuBuffer<AFloat> fBuffer;
105 TCpuBuffer<AFloat>& GetBuffer() {
return fBuffer;}
106 const TCpuBuffer<AFloat>& GetBuffer()
const {
return fBuffer;}
109 static const AFloat *GetOnePointer() {
return fOnes.data(); }
111 static size_t GetOnePointerSize() {
return fOnes.size(); }
113 static void InitializeOneVector(
size_t n);
115 TCpuMatrix() : fNCols(0), fNRows(0) {}
118 TCpuMatrix(
size_t nRows,
size_t nCols);
121 TCpuMatrix(
const TMatrixT<AFloat> &);
124 TCpuMatrix(
const TCpuBuffer<AFloat> &buffer,
size_t m,
size_t n);
127 TCpuMatrix(
const TCpuMatrix &) =
default;
128 TCpuMatrix(TCpuMatrix &&) =
default;
129 TCpuMatrix &operator=(
const TCpuMatrix &) =
default;
130 TCpuMatrix &operator=(TCpuMatrix &&) =
default;
131 ~TCpuMatrix() =
default;
139 operator TMatrixT<AFloat>()
const;
143 template <
typename Function_t>
144 void Map(Function_t &f);
148 template <
typename Function_t>
149 void MapFrom(Function_t &f,
const TCpuMatrix &A);
151 size_t GetNrows()
const {
return fNRows; }
152 size_t GetNcols()
const {
return fNCols; }
153 size_t GetNoElements()
const {
return fNRows * fNCols; }
154 size_t GetSize()
const {
return fNRows * fNCols; }
157 AFloat operator()(
size_t i,
size_t j)
const {
return fBuffer[j * fNRows + i]; }
158 AFloat &operator()(
size_t i,
size_t j) {
return fBuffer[j * fNRows + i]; }
162 AFloat *GetRawDataPointer() {
return fBuffer; }
163 const AFloat *GetRawDataPointer()
const {
return fBuffer; }
165 static Executor &GetThreadExecutor() {
return TMVA::Config::Instance().GetThreadExecutor(); }
168 static size_t GetNWorkItems(
size_t nelements);
173 TCpuMatrix cpuMatrix = *
this;
174 TMVA_DNN_PrintTCpuMatrix(cpuMatrix,
"CpuMatrix");
181 template <
typename AFloat>
182 std::vector<AFloat> TCpuMatrix<AFloat>::fOnes{};
186 template <
typename AFloat>
187 size_t TCpuMatrix<AFloat>::GetNWorkItems(
size_t nElements)
192 const size_t minElements = 1000;
193 const size_t nCpu = TMVA::Config::Instance().GetNCpu();
194 if (nElements <= minElements)
196 if (nElements < nCpu * minElements) {
197 size_t nt = nElements / minElements;
198 return nElements / nt;
200 return nElements / nCpu;
206 template <
typename AFloat>
207 template <
typename Function_t>
208 inline void TCpuMatrix<AFloat>::Map(Function_t &f)
210 AFloat *data = GetRawDataPointer();
211 size_t nelements = GetNoElements();
212 size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
214 auto ff = [data, &nsteps, &nelements, &f](UInt_t workerID) {
215 size_t jMax = std::min(workerID + nsteps, nelements);
216 for (
size_t j = workerID; j < jMax; ++j) {
217 data[j] = f(data[j]);
222 if (nsteps < nelements) {
223 TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
229 R__ASSERT(nelements == nsteps);
235 template <
typename AFloat>
236 template <
typename Function_t>
237 inline void TCpuMatrix<AFloat>::MapFrom(Function_t &f,
const TCpuMatrix &A)
239 AFloat *dataB = GetRawDataPointer();
240 const AFloat *dataA = A.GetRawDataPointer();
242 size_t nelements = GetNoElements();
243 R__ASSERT(nelements == A.GetNoElements());
244 size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
246 auto ff = [&dataB, &dataA, &nsteps, &nelements, &f](UInt_t workerID) {
247 size_t jMax = std::min(workerID + nsteps, nelements);
248 for (
size_t j = workerID; j < jMax; ++j) {
249 dataB[j] = f(dataA[j]);
253 if (nsteps < nelements) {
254 TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
259 R__ASSERT(nelements == nsteps);
264 template <
typename AFloat>
265 void TCpuMatrix<AFloat>::Zero()
267 for (
size_t j = 0; j < fNCols; j++) {
268 for (
size_t i = 0; i < fNRows; i++) {