CudaMatrix.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 13/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.            *
 *************************************************************************/

///////////////////////////////////////////////////////////////////////
// Contains the TCudaMatrix class for the representation of matrices //
// on CUDA devices as well as the TCudaDeviceReference class, which  //
// is a helper class to emulate lvalue references to floating point  //
// values on the device.                                              //
///////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX

#include <cstdio>   // fprintf
#include <cstdlib>  // exit

#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "curand_kernel.h"

#include "TMatrixT.h"
#include "CudaBuffers.h"

#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }

namespace TMVA {
namespace DNN {

/** Function to check cuda return codes. Taken from
 *  http://stackoverflow.com/questions/14038589/
 */
inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);

//____________________________________________________________________________
//
// Cuda Device Reference
//____________________________________________________________________________

/** TCudaDeviceReference
 *
 * Helper class emulating lvalue references for AFloat values that are
 * physically on the device. Allows, for example, assigning to matrix elements.
 * Note that device access through a TCudaDeviceReference enforces
 * synchronization with all streams and is therefore a performance killer.
 * Only used for testing.
 */
template<typename AFloat>
class TCudaDeviceReference
{
private:

   AFloat * fDevicePointer;

public:

   TCudaDeviceReference(AFloat * devicePointer);

   operator AFloat();

   void operator=(const TCudaDeviceReference &other);
   void operator=(AFloat value);
   void operator+=(AFloat value);
   void operator-=(AFloat value);
};
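
// Example: element access through TCudaDeviceReference. This is a minimal
// sketch for illustration only; each access below triggers a synchronous
// cudaMemcpy and device-wide synchronization, so it is meant for testing
// rather than for performance-critical code.
//
//    TCudaMatrix<float> A(2, 3);   // 2x3 matrix on the device
//    A(0, 1) = 3.0f;               // assignment through a TCudaDeviceReference
//    A(0, 1) += 1.0f;              // read-modify-write round trip
//    float x = A(0, 1);            // synchronous read back to the host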

//____________________________________________________________________________
//
// Cuda Matrix
//____________________________________________________________________________

/** TCudaMatrix Class
 *
 * The TCudaMatrix class represents matrices on a CUDA device. The elements
 * of the matrix are stored in a TCudaDeviceBuffer object, which takes care of
 * the allocation and freeing of the device memory. TCudaMatrix objects are
 * lightweight: assignment and copy construction perform only a shallow copy,
 * so no new element buffer is allocated. To perform a deep copy, use the
 * static Copy method of the TCuda architecture class.
 *
 * The TCudaDeviceBuffer has an associated CUDA stream, on which the data is
 * transferred to the device. This stream can be accessed through the
 * GetComputeStream member function and used to synchronize computations.
 *
 * The TCudaMatrix class also holds static references to CUDA resources:
 * the cuBLAS handle, a buffer of curand states for the generation of
 * random numbers, and a vector of ones that is used for summing over the
 * columns of a matrix via matrix-vector multiplication. The class also
 * has a static buffer for returning results from the device.
 */
template<typename AFloat>
class TCudaMatrix
{
private:

   static size_t fInstances;              ///< Current number of matrix instances.
   static cublasHandle_t fCublasHandle;
   static AFloat * fDeviceReturn;         ///< Buffer for kernel return values.
   static AFloat * fOnes;                 ///< Vector used for summations of columns.
   static size_t fNOnes;                  ///< Current length of the one vector.
   static curandState_t * fCurandStates;
   static size_t fNCurandStates;

   size_t fNRows;
   size_t fNCols;
   TCudaDeviceBuffer<AFloat> fElementBuffer;

public:

   static Bool_t gInitializeCurand;

   static AFloat * GetOnes() {return fOnes;}

   TCudaMatrix();
   TCudaMatrix(size_t i, size_t j);
   TCudaMatrix(const TMatrixT<AFloat> &);
   TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);

   TCudaMatrix(const TCudaMatrix &) = default;
   TCudaMatrix(TCudaMatrix &&) = default;
   TCudaMatrix & operator=(const TCudaMatrix &) = default;
   TCudaMatrix & operator=(TCudaMatrix &&) = default;
   ~TCudaMatrix() = default;

   /** Convert CUDA matrix to a ROOT TMatrix. Performs a synchronous data transfer. */
   operator TMatrixT<AFloat>() const;

   inline cudaStream_t GetComputeStream() const;
   inline void SetComputeStream(cudaStream_t stream);
   /** Set the return buffer on the device to the specified value. This is
    *  required, for example, for reductions in order to initialize the
    *  accumulator. */
   inline static void ResetDeviceReturn(AFloat value = 0.0);
   /** Transfer the value in the device return buffer to the host. This
    *  transfer is synchronous. */
   inline static AFloat GetDeviceReturn();
   /** Return device pointer to the device return buffer. */
   inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}

   /** Make this matrix's compute stream wait for work already enqueued on the
    *  compute stream of the given matrix, if it is not the default stream. */
   inline void Synchronize(const TCudaMatrix &) const;

   static size_t GetNDim() {return 2;}
   size_t GetNrows() const {return fNRows;}
   size_t GetNcols() const {return fNCols;}
   size_t GetNoElements() const {return fNRows * fNCols;}

   const AFloat * GetDataPointer() const {return fElementBuffer;}
   AFloat * GetDataPointer() {return fElementBuffer;}
   const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}

   inline TCudaDeviceBuffer<AFloat> GetDeviceBuffer() const {return fElementBuffer;}

   /** Access to elements of device matrices provided through the
    *  TCudaDeviceReference class. Note that access is synchronous and enforces
    *  device synchronization on all streams. Only used for testing. */
   TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;

   void Print() const {
      TMatrixT<AFloat> mat(*this);
      mat.Print();
   }

   void Zero() {
      cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
   }


private:

   /** Initializes all shared device resources and makes sure that a sufficient
    *  number of curand states is allocated on the device and initialized, and
    *  that the one-vector for the summation over columns has the right size. */
   void InitializeCuda();
   void InitializeCurandStates();

};
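
// Example: typical host-side use of TCudaMatrix. This is a minimal sketch for
// illustration only; the column-sum lines assume single precision (AFloat =
// float) and the standard cublasSgemv signature, and `ones`, `colSums`, `m`,
// `n`, `alpha` (= 1) and `beta` (= 0) are placeholder names, with `colSums` a
// hypothetical device array of length GetNcols().
//
//    TMatrixT<float> host(4, 4);
//    TCudaMatrix<float> A(host);   // copies the host matrix to the device
//    TCudaMatrix<float> B = A;     // shallow copy: A and B share one element buffer
//    TMatrixT<float> back(A);      // synchronous transfer back to the host
//
//    // Summing over the columns of A with the static ones vector and cuBLAS:
//    //   colSums = A^T * ones,  with ones = TCudaMatrix<float>::GetOnes()
//    // e.g. cublasSgemv(A.GetCublasHandle(), CUBLAS_OP_T, m, n,
//    //                  &alpha, A.GetDataPointer(), m, ones, 1, &beta, colSums, 1);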

//
// Inline Functions.
//______________________________________________________________________________
inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
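
// Example: wrapping CUDA runtime calls with the CUDACHECK macro so that any
// error aborts with file and line information. A minimal sketch; the buffer
// size is arbitrary.
//
//    float *devPtr = nullptr;
//    CUDACHECK(cudaMalloc((void **) &devPtr, 16 * sizeof(float)));
//    CUDACHECK(cudaMemset(devPtr, 0, 16 * sizeof(float)));
//    CUDACHECK(cudaFree(devPtr));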

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)
    : fDevicePointer(devicePointer)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::operator AFloat()
{
   AFloat buffer;
   cudaMemcpy(&buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
{
   cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDevicePointer, &buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(&buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer += value;
   cudaMemcpy(fDevicePointer, &buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(&buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer -= value;
   cudaMemcpy(fDevicePointer, &buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
{
   return fElementBuffer.GetComputeStream();
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
{
   fElementBuffer.SetComputeStream(stream);
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
{
   cudaEvent_t event;
   cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
   cudaEventRecord(event, A.GetComputeStream());
   cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
   cudaEventDestroy(event);
}
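
// Example: ordering work across the compute streams of two matrices. A minimal
// sketch; the kernel or cuBLAS call that reads A is left out and only hinted at
// in the comments.
//
//    TCudaMatrix<float> A(16, 16), B(16, 16);
//    // ... enqueue work that writes A on A.GetComputeStream() ...
//    B.Synchronize(A);   // B's stream now waits for A's pending work
//    // ... work enqueued on B.GetComputeStream() may now safely read A ...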

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDeviceReturn, &buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
{
   AFloat buffer;
   cudaMemcpy(&buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
   return buffer;
}
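
// Example: the intended pattern for the static device-return buffer when
// running a reduction. A minimal sketch; `MyReductionKernel`, `grid` and
// `block` are hypothetical and stand in for any kernel that accumulates its
// result into GetDeviceReturnPointer().
//
//    TCudaMatrix<float>::ResetDeviceReturn(0.0);   // initialize the accumulator
//    // MyReductionKernel<<<grid, block>>>(A.GetDataPointer(), A.GetNoElements(),
//    //                                    TCudaMatrix<float>::GetDeviceReturnPointer());
//    float result = TCudaMatrix<float>::GetDeviceReturn();   // synchronous read back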

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
{
   // Elements are stored in column-major order: element (i,j) sits at
   // offset j * fNRows + i in the device buffer.
   AFloat * elementPointer = fElementBuffer;
   elementPointer += j * fNRows + i;
   return TCudaDeviceReference<AFloat>(elementPointer);
}

} // namespace DNN
} // namespace TMVA

#endif