Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
CudaTensor.h
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 13/07/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ///////////////////////////////////////////////////////////////////////
13 // Contains the TCudaMatrix class for the representation of matrices //
14 // on CUDA devices as well as the TCudaDeviceReference class which //
15 // is a helper class to emulate lvalue references to floating point //
16 // values on the device. //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
21 
22 
23 #include <vector>
24 #include <cstring>
25 #include <cassert>
26 #include <iostream>
27 
28 #include "RConfigure.h"
29 #include "TMatrixT.h"
30 #include "CudaBuffers.h"
31 #include "CudaMatrix.h"
32 //#include "TMVA/RTensor.hxx"
33 
34 #ifdef R__HAS_CUDNN
35 #include "cudnn.h"
36 #define CUDNNCHECK(ans) {cudnnError((ans), __FILE__, __LINE__); }
37 #endif
38 
39 namespace TMVA {
40 
41 
42 
43 #ifndef TMVA_RTENSOR
44 
45 namespace Experimental {
46 /// Memory layout type (copy from RTensor.hxx)
47 enum class MemoryLayout : uint8_t {
48  RowMajor = 0x01,
49  ColumnMajor = 0x02
50 };
51 }
52 #endif
53 
54 namespace DNN {
55 
56 using MemoryLayout = TMVA::Experimental::MemoryLayout;
57 
58 #ifdef R__HAS_CUDNN
59 /**
60  * Function to handle the status output of cuDNN function calls. See also
61  * CUDACHECK in CudaMatrix.h.
62  */
63 inline void cudnnError(cudnnStatus_t status, const char *file, int line, bool abort=true)
64 {
65  if (status != CUDNN_STATUS_SUCCESS) {
66  fprintf(stderr, "CUDNN Error: %s %s %d\n", cudnnGetErrorString(status), file, line);
67  if (abort)
68  exit(status);
69  }
70 }
71 #endif
72 //____________________________________________________________________________
73 //
74 // Cuda Tensor
75 //____________________________________________________________________________
76 
77 /** TCudaTensor Class
78  *
79  * The TCudaTensor class extends the TCudaMatrix class for dimensions > 2.
80  *
81  */
82 template<typename AFloat>
83 class TCudaTensor
84 {
85 public:
86 
87  using Shape_t = std::vector<size_t>;
88  using MemoryLayout = TMVA::Experimental:: MemoryLayout;
89  using Scalar_t = AFloat;
90 
91 
92 private:
93 
94 #ifdef R__HAS_CUDNN
95  struct TensorDescriptor {
96  cudnnTensorDescriptor_t fCudnnDesc;
97  };
98 
99  static std::vector<cudnnHandle_t> fCudnnHandle; ///< Holds the cuddn library context (one for every CUDA stream)
100 
101  static cudnnDataType_t fDataType; ///< Cudnn datatype used for the tensor
102 #else
103  struct TensorDescriptor {
104  };
105 #endif
106 
107  /** For each GPU device keep the CUDA streams in which tensors are used.
108  * Instances belonging to the same stream on the same deviceshare a
109  * cudnn library handel to keep cudnn contexts seperated */
110  //static std::vector<std::vector<int> > fInstances;
111  static std::vector<int> fInstances;
112 
113  /** The shape vector (size of dimensions) needs to be ordered as no. channels,
114  * image dimensions.
115  */
116  Shape_t fShape; ///< spatial subdimensions
117  Shape_t fStrides; ///< Strides between tensor dimensions (always assume dense, non overlapping tensor)
118  size_t fNDim; ///< Dimension of the tensor (first dimension is the batch size, second is the no. channels)
119  size_t fSize; ///< No. of elements
120  int fDevice; ///< Device associated with current tensor instance
121  int fStreamIndx; ///< Cuda stream associated with current instance
122 
123  std::shared_ptr<TensorDescriptor> fTensorDescriptor;
124  TCudaDeviceBuffer<AFloat> fElementBuffer;
125 
126  MemoryLayout fMemoryLayout;
127 
128 
129 
130 public:
131 
132 
133  //static AFloat * GetOnes() {return fOnes;}
134 
135  TCudaTensor();
136 
137  TCudaTensor(const AFloat * data,
138  const std::vector<size_t> & shape,
139  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
140  int deviceIndx = 0, int streamIndx = 0);
141  TCudaTensor(TCudaDeviceBuffer<AFloat> buffer,
142  const std::vector<size_t> & shape,
143  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
144  int deviceIndx = 0, int streamIndx = 0);
145  TCudaTensor(const std::vector<size_t> & shape,
146  MemoryLayout memlayout = MemoryLayout::ColumnMajor,
147  int deviceIndx = 0, int streamIndx = 0);
148 
149  TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
150  TCudaTensor( (memlayout == MemoryLayout::ColumnMajor) ? Shape_t({ csize, hwsize, bsize}) : Shape_t({ bsize, csize, hwsize }) , memlayout,
151  deviceIndx, streamIndx)
152  {}
153 
154  TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
155 
156  TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
157  {
158  if (memlayout == MemoryLayout::ColumnMajor)
159  *this = TCudaTensor(fElementBuffer, { csize, hsize, wsize, bsize}, memlayout, deviceIndx, streamIndx);
160  }
161 
162  TCudaTensor(size_t n, size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
163  // TCudaTensor( {n,m}, memlayout, deviceIndx, streamIndx) :
164  TCudaTensor( {n, m}, memlayout, deviceIndx, streamIndx)
165  {}
166 
167  TCudaTensor(const TCudaMatrix<AFloat> & m, size_t dim = 2);
168 
169  TCudaTensor(TCudaDeviceBuffer<AFloat> buffer, size_t n, size_t m) :
170  TCudaTensor( buffer, {n,m}, MemoryLayout::ColumnMajor ,0,0) {}
171 
172  TCudaTensor(const TCudaTensor &) = default;
173  TCudaTensor(TCudaTensor &&) = default;
174  TCudaTensor & operator=(const TCudaTensor &) = default;
175  TCudaTensor & operator=( TCudaTensor &&) = default;
176  ~TCudaTensor();
177 
178  /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
179  operator TMatrixT<AFloat>() const;
180 
181 
182  MemoryLayout GetLayout() const { return fMemoryLayout; }
183 
184  const Shape_t & GetShape() const {return fShape;}
185  const Shape_t & GetStrides() const {return fStrides;}
186  size_t GetDimAt(size_t i) const {return fShape[i];}
187  size_t GetNDim() const {return fNDim;}
188  size_t GetSize() const {return fSize;}
189 
190  const AFloat * GetDataPointer() const {return fElementBuffer;}
191  AFloat * GetDataPointer() {return fElementBuffer;}
192  const AFloat * GetData() const {return fElementBuffer;}
193  AFloat * GetData() {return fElementBuffer;}
194 
195  const AFloat * GetDataPointerAt(size_t i ) const {
196  return (const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
197  AFloat * GetDataPointerAt(size_t i ) {return fElementBuffer.GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
198 
199 
200  const TCudaDeviceBuffer<AFloat> & GetDeviceBuffer() const {return fElementBuffer;}
201  TCudaDeviceBuffer<AFloat> & GetDeviceBuffer() {return fElementBuffer;}
202 
203 #ifdef R__HAS_CUDNN
204  const cudnnHandle_t & GetCudnnHandle() const {return fCudnnHandle[fStreamIndx];}
205  const cudnnTensorDescriptor_t & GetTensorDescriptor() const {return fTensorDescriptor->fCudnnDesc;}
206  static cudnnDataType_t GetDataType() { return fDataType; }
207 #endif
208 
209  cudaStream_t GetComputeStream() const {
210  return fElementBuffer.GetComputeStream();
211  }
212  void SetComputeStream(cudaStream_t stream) {
213  fElementBuffer.SetComputeStream(stream);
214  }
215 
216  bool isEqual (TCudaTensor<AFloat> & other) {
217 
218  if (fSize != other.GetSize()) return false;
219 
220 
221  std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
222  std::unique_ptr<AFloat[]> hostBufferOther(new AFloat[fSize]);
223  cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
224  cudaMemcpyDeviceToHost);
225  cudaMemcpy(hostBufferOther.get(), other.GetDeviceBuffer(), fSize * sizeof(AFloat),
226  cudaMemcpyDeviceToHost);
227 
228  for (size_t i = 0; i < fSize; i++) {
229  if (hostBufferThis[i] != hostBufferOther[i]) return false;
230  }
231  return true;
232  }
233 
234  bool isEqual (const AFloat * hostBufferOther, size_t otherSize) {
235  if (fSize != otherSize) return false;
236 
237 
238  std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
239  cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
240  cudaMemcpyDeviceToHost);
241 
242  for (size_t i = 0; i < fSize; i++) {
243  if (hostBufferThis[i] != hostBufferOther[i]) return false;
244  }
245 
246  return true;
247  }
248 
249  void Print(const char * name = "Tensor", bool truncate = false) const;
250 
251  void PrintShape(const char * name="Tensor") const;
252 
253  void Zero() {
254  cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetSize());
255  }
256 
257  void SetConstVal(const AFloat constVal) {
258  TCudaHostBuffer<AFloat> hostBuffer(fSize);
259  hostBuffer.SetConstVal(constVal);
260  fElementBuffer.CopyFrom(hostBuffer);
261  }
262 
263  // have this tensor representatrions
264  // 2-dimensional tensors : NW where N is batch size W is the feature size . Memory layout should be columnwise in this case
265  // 3 -dimensional tensor : represnetation is NHWC , tensor should be columnwise storage
266  // 4 -dimensional tensor : representation is NCHW ande tensor should be row wose
267  // a rowmajor tensor with dimension less than trhee should not exist but in case consider as a N, (CHW) for 2d, N, C, (HW) for 3d
268  // a columnmajor tensor for dimension >=4 should not exist but in case consider as a N,H,W,C (i.e. with shape C,W,H,N)
269 
270  size_t GetFirstSize() const {
271  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fShape.front(); } // CM order
272  size_t GetFirstStride() const {
273  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fStrides.front(); } // CM order
274 
275  size_t GetCSize() const {
276  if (fNDim == 2) return 1;
277  return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.front() : fShape[1] ; //assume NHWC
278  }
279  size_t GetHSize() const {
280  if (fNDim == 2) return fShape[0];
281  if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[0] : fShape[1] ;// same as C
282  if (fNDim >= 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[2] : fShape[2] ;
283  return 0;
284  }
285  size_t GetWSize() const {
286  if (fNDim == 2) return fShape[1];
287  if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[1] : fShape[2] ;
288  if (fNDim == 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[3] : fShape[3] ;
289  return 0;
290  }
291 
292  // for backward compatibility (assume column-major
293  // for backward compatibility : for CM tensor (n1,n2,n3,n4) -> ( n1*n2*n3, n4)
294  // for RM tensor (n1,n2,n3,n4) -> ( n2*n3*n4, n1 ) ???
295  size_t GetNrows() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fShape.front();}
296  size_t GetNcols() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fStrides.front(); }
297 
298 
299  // Matrix conversion for tensors of shape 2
300  TCudaMatrix<AFloat> GetMatrix() const {
301  if (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1))
302  return TCudaMatrix<AFloat>(fElementBuffer, GetHSize(), GetWSize());
303 
304 
305  // remember TCudaMatrix is always column-major
306  //case of N,M,1,1,..
307  bool caseNM11 = true;
308  for (size_t i = 2; i < fNDim; ++i) caseNM11 &= fShape[i] == 1;
309  if (caseNM11) {
310  return (GetLayout() == MemoryLayout::ColumnMajor ) ?
311  TCudaMatrix<AFloat>(fElementBuffer, fShape[0], fShape[1]) :
312  TCudaMatrix<AFloat>(fElementBuffer, fShape[1], fShape[0]);
313  }
314  bool case11NM = true;
315  for (size_t i = 0; i < fNDim-2; ++i) case11NM &= fShape[i] == 1;
316  if (case11NM) {
317  return (GetLayout() == MemoryLayout::ColumnMajor ) ?
318  TCudaMatrix<AFloat>(fElementBuffer, fShape[fNDim-2], fShape[fNDim-1]) :
319  TCudaMatrix<AFloat>(fElementBuffer, fShape[fNDim-1], fShape[fNDim-2]);
320  }
321 
322  assert(false);
323  return TCudaMatrix<AFloat>();
324  }
325 
326 
327 
328  static inline std::vector<std::size_t> ComputeStridesFromShape(const std::vector<std::size_t> &shape,
329  bool rowmajorLayout);
330 
331  void ReshapeInPlace(const Shape_t & newShape) {
332  fShape = newShape;
333  fStrides = ComputeStridesFromShape(fShape, fMemoryLayout == MemoryLayout::RowMajor);
334  fNDim = fShape.size();
335  // in principle reshape should not change tensor size
336  size_t newSize = (fMemoryLayout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front() : fStrides.back() * fShape.back();
337  R__ASSERT(newSize <= fSize);
338  fSize = newSize;
339  // reset the descritor for Cudnn
340  SetTensorDescriptor();
341  }
342 
343  TCudaTensor<AFloat> Reshape(const Shape_t & newShape) const {
344  TCudaTensor<AFloat> tmp(*this);
345  // have a new descriptor for reshaped tensor !!!
346 #ifdef R__HAS_CUDNN
347  tmp.fTensorDescriptor.reset( new TensorDescriptor() );
348  // t.b.d. need to check if we delete the cudnn object
349  CUDNNCHECK(cudnnCreateTensorDescriptor(&(tmp.fTensorDescriptor->fCudnnDesc)));
350 #endif
351  tmp.ReshapeInPlace(newShape);
352  return tmp;
353  }
354 
355  void SetTensorDescriptor();
356 
357  // return slice of tensor
358  // return slices in the first dimension (if row wise) or last dimension if colun wise
359  // so single event slides
360  TCudaTensor<AFloat> At(size_t i) const {
361  Shape_t sliced_shape = (GetLayout() == MemoryLayout::RowMajor)
362  ? Shape_t(fShape.begin() + 1, fShape.end()) :
363  Shape_t(fShape.begin(), fShape.end() - 1);
364 
365 
366  size_t buffsize = (GetLayout() == MemoryLayout::RowMajor) ?
367  fStrides.front() : fStrides.back();
368 
369  size_t offset = i * buffsize;
370 
371  return TCudaTensor<AFloat>((const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(offset, buffsize), sliced_shape, GetLayout());
372  }
373 
374 
375  // element access ( for debugging)
376  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const
377  {
378  // like this works also for multi-dim tensors
379  // and consider the tensor as a multidim one
380  size_t nrows = GetNrows();
381  size_t ncols = GetNcols();
382 
383  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
384  i * ncols + j : j * nrows + i;
385 
386  AFloat * elementPointer = fElementBuffer + offset;
387  return TCudaDeviceReference<AFloat>(elementPointer);
388  }
389  // element access ( for debugging)
390  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k) const
391  {
392  // k is B, i is C, j is HW :
393  assert( fNDim >= 3); // || ( k==0 && fNDim == 2 ) );
394  //note for larger dimension k is all other dims collapsed !!!
395 
396  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
397  i * fStrides[0] + j * fStrides[1] + k :
398  i * fStrides[2] + k * fStrides[1] + j;
399 
400  AFloat * elementPointer = fElementBuffer + offset;
401 
402  return TCudaDeviceReference<AFloat>(elementPointer);
403  }
404 
405  TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k, size_t l) const
406  {
407  // for rowsise
408  //assert(GetLayout() == MemoryLayout::RowMajor);
409  assert( fNDim == 4); // || ( k==0 && fNDim == 2 ) );
410 
411  size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
412  i * fStrides[0] + j * fStrides[1] + k * fStrides[2] + l:
413  l * fStrides[3] + k * fStrides[2] + j * fStrides[1] + i;
414 
415  AFloat * elementPointer = fElementBuffer + offset;
416 
417  return TCudaDeviceReference<AFloat>(elementPointer);
418  }
419 
420 
421 
422 
423 private:
424 
425  /** Initializes all shared devices resource and makes sure that a sufficient
426  * number of curand states are allocated on the device and initialized as
427  * well as that the one-vector for the summation over columns has the right
428  * size. */
429  void InitializeCuda();
430  void InitializeCurandStates();
431 
432 };
433 
434 
435 
436 
437 } // namespace DNN
438 } // namespace TMVA
439 
440 #endif