#ifndef TMVA_DNN_GENERALLAYER
#define TMVA_DNN_GENERALLAYER

#include <limits>
#include <sstream>
#include <string>
#include <vector>

// ROOT / TMVA headers used by the implementations below
#include "TError.h"
#include "TMatrixT.h"
#include "TMVA/Tools.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {
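/** \class VGeneralLayer
    Generic layer base class.

    Every layer is parametrized by an architecture backend (Architecture_t) that
    provides the matrix, tensor and scalar types as well as the numerical kernels
    (Copy, ScaleAdd, ...). The base class owns the layer geometry, the weight and
    bias matrices, their gradients, the output tensor and the activation
    gradients; derived classes implement Forward, Backward, Print and the XML
    (de)serialization of their weights.
*/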
template <typename Architecture_t>
class VGeneralLayer {

   using Tensor_t = typename Architecture_t::Tensor_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   size_t fBatchSize;   ///< Batch size used for training and evaluation
   size_t fInputDepth;  ///< The depth of the previous layer or input
   size_t fInputHeight; ///< The height of the previous layer or input
   size_t fInputWidth;  ///< The width of the previous layer or input

   size_t fDepth;  ///< The depth of the layer
   size_t fHeight; ///< The height of the layer
   size_t fWidth;  ///< The width of this layer

   bool fIsTraining; ///< Flag indicating whether the layer is in training mode

   std::vector<Matrix_t> fWeights; ///< The weights associated to the layer
   std::vector<Matrix_t> fBiases;  ///< The biases associated to the layer

   std::vector<Matrix_t> fWeightGradients; ///< Gradients w.r.t. the weights of the layer
   std::vector<Matrix_t> fBiasGradients;   ///< Gradients w.r.t. the bias values of the layer

   Tensor_t fOutput;              ///< Activations of this layer
   Tensor_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer

   EInitialization fInit; ///< The initialization method

public:
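   /*! Constructors: the first overload uses one common shape for all weight matrices
    *  (WeightsNRows x WeightsNCols) and one for all bias matrices; the second overload
    *  allows a different shape for every weight and bias slice. */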
   VGeneralLayer(size_t BatchSize, size_t InputDepth, size_t InputHeight, size_t InputWidth, size_t Depth,
                 size_t Height, size_t Width, size_t WeightsNSlices, size_t WeightsNRows, size_t WeightsNCols,
                 size_t BiasesNSlices, size_t BiasesNRows, size_t BiasesNCols, size_t OutputNSlices, size_t OutputNRows,
                 size_t OutputNCols, EInitialization Init);
   VGeneralLayer(size_t BatchSize, size_t InputDepth, size_t InputHeight, size_t InputWidth, size_t Depth,
                 size_t Height, size_t Width, size_t WeightsNSlices, std::vector<size_t> WeightsNRows,
                 std::vector<size_t> WeightsNCols, size_t BiasesNSlices, std::vector<size_t> BiasesNRows,
                 std::vector<size_t> BiasesNCols, size_t OutputNSlices, size_t OutputNRows, size_t OutputNCols,
                 EInitialization Init);
   VGeneralLayer(VGeneralLayer<Architecture_t> *layer);
   VGeneralLayer(const VGeneralLayer &);
   virtual ~VGeneralLayer();
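   /*! Initializes the weights according to the stored initialization method; the biases
    *  and all gradient matrices are set to zero. */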
   virtual void Initialize();
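   /*! Computes the activation of the layer for the given input tensor and stores it in
    *  the output tensor; the applyDropout flag is only meaningful for layers that
    *  implement dropout. */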
   virtual void Forward(Tensor_t &input, bool applyDropout = false) = 0;
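   /*! Backpropagates the layer: computes the gradients with respect to the weights and
    *  biases and writes the gradients to be passed on to the previous layer into
    *  gradients_backward, using that layer's forward activations. */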
   virtual void Backward(Tensor_t &gradients_backward, const Tensor_t &activations_backward) = 0;
   virtual void ResetTraining() {}
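   /*! The Update* methods add -learningRate times the given matrices to the weights,
    *  biases, or the stored weight/bias gradients; the Copy* methods overwrite the
    *  layer's weights and biases with the matrices provided. */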
   void Update(const Scalar_t learningRate);
   void UpdateWeights(const std::vector<Matrix_t> &weightGradients, const Scalar_t learningRate);

   void UpdateBiases(const std::vector<Matrix_t> &biasGradients, const Scalar_t learningRate);

   void UpdateWeightGradients(const std::vector<Matrix_t> &weightGradients, const Scalar_t learningRate);

   void UpdateBiasGradients(const std::vector<Matrix_t> &biasGradients, const Scalar_t learningRate);

   void CopyWeights(const std::vector<Matrix_t> &otherWeights);

   void CopyBiases(const std::vector<Matrix_t> &otherBiases);
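   /*! Copies the weights, biases and any extra layer parameters from a layer that may use a
    *  different architecture backend (e.g. GPU-trained weights into a CPU layer), via
    *  Architecture_t::CopyDiffArch. */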
   template <typename Arch>
   void CopyParameters(const VGeneralLayer<Arch> &layer);
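   /*! Printing and weight persistence: every concrete layer must print itself and be able
    *  to write its weights to, and read them back from, an XML node. */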
   virtual void Print() const = 0;

   virtual void AddWeightsXMLTo(void *parent) = 0;

   virtual void ReadWeightsFromXML(void *parent) = 0;
   virtual void SetDropoutProbability(Scalar_t ) {}
   size_t GetBatchSize() const { return fBatchSize; }
   size_t GetInputDepth() const { return fInputDepth; }
   size_t GetInputHeight() const { return fInputHeight; }
   size_t GetInputWidth() const { return fInputWidth; }
   size_t GetDepth() const { return fDepth; }
   size_t GetHeight() const { return fHeight; }
   size_t GetWidth() const { return fWidth; }
   bool IsTraining() const { return fIsTraining; }
   const std::vector<Matrix_t> &GetWeights() const { return fWeights; }
   std::vector<Matrix_t> &GetWeights() { return fWeights; }

   const Matrix_t &GetWeightsAt(size_t i) const { return fWeights[i]; }
   Matrix_t &GetWeightsAt(size_t i) { return fWeights[i]; }

   const std::vector<Matrix_t> &GetBiases() const { return fBiases; }
   std::vector<Matrix_t> &GetBiases() { return fBiases; }

   const Matrix_t &GetBiasesAt(size_t i) const { return fBiases[i]; }
   Matrix_t &GetBiasesAt(size_t i) { return fBiases[i]; }

   const std::vector<Matrix_t> &GetWeightGradients() const { return fWeightGradients; }
   std::vector<Matrix_t> &GetWeightGradients() { return fWeightGradients; }

   const Matrix_t &GetWeightGradientsAt(size_t i) const { return fWeightGradients[i]; }
   Matrix_t &GetWeightGradientsAt(size_t i) { return fWeightGradients[i]; }

   const std::vector<Matrix_t> &GetBiasGradients() const { return fBiasGradients; }
   std::vector<Matrix_t> &GetBiasGradients() { return fBiasGradients; }

   const Matrix_t &GetBiasGradientsAt(size_t i) const { return fBiasGradients[i]; }
   Matrix_t &GetBiasGradientsAt(size_t i) { return fBiasGradients[i]; }
   const Tensor_t &GetOutput() const { return fOutput; }
   Tensor_t &GetOutput() { return fOutput; }

   const Tensor_t &GetActivationGradients() const { return fActivationGradients; }
   Tensor_t &GetActivationGradients() { return fActivationGradients; }

   Matrix_t GetOutputAt(size_t i) { return fOutput.At(i).GetMatrix(); }
   const Matrix_t &GetOutputAt(size_t i) const { return fOutput.At(i).GetMatrix(); }

   Matrix_t GetActivationGradientsAt(size_t i) { return fActivationGradients.At(i).GetMatrix(); }
   const Matrix_t &GetActivationGradientsAt(size_t i) const { return fActivationGradients.At(i).GetMatrix(); }
   virtual std::vector<Matrix_t> GetExtraLayerParameters() const { return std::vector<Matrix_t>(); }
   virtual void SetExtraLayerParameters(const std::vector<Matrix_t> & ) {}
   EInitialization GetInitialization() const { return fInit; }
   void SetBatchSize(size_t batchSize) { fBatchSize = batchSize; }
   void SetInputDepth(size_t inputDepth) { fInputDepth = inputDepth; }
   void SetInputHeight(size_t inputHeight) { fInputHeight = inputHeight; }
   void SetInputWidth(size_t inputWidth) { fInputWidth = inputWidth; }
   void SetDepth(size_t depth) { fDepth = depth; }
   void SetHeight(size_t height) { fHeight = height; }
   void SetWidth(size_t width) { fWidth = width; }
   void SetIsTraining(bool isTraining) { fIsTraining = isTraining; }
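   /*! Helpers for the XML persistence of concrete layers: they write a tensor (a list of
    *  equally sized matrices) or a single matrix as the raw text payload of an XML node,
    *  and read a matrix back from such a node. */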
   void WriteTensorToXML(void *node, const char *name, const std::vector<Matrix_t> &tensor);

   void WriteMatrixToXML(void *node, const char *name, const Matrix_t &matrix);

   void ReadMatrixXML(void *node, const char *name, Matrix_t &matrix);
};
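// Illustrative sketch (not part of the original interface): how a concrete layer derived
// from VGeneralLayer is typically driven during one training step. "ConcreteLayer_t",
// "input", "lossGradients" and "prevActivations" are placeholder names used only for
// illustration; the real driver code lives in the TMVA deep-net classes.
//
//    ConcreteLayer_t layer(/* geometry, weight/bias shapes, an EInitialization value */);
//    layer.Initialize();                              // weights <- init scheme, gradients <- 0
//    layer.Forward(input, /*applyDropout=*/true);     // fills layer.GetOutput()
//    layer.Backward(lossGradients, prevActivations);  // fills weight and bias gradients
//    layer.Update(learningRate);                      // W -= lr * dW,  b -= lr * db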
template <typename Architecture_t>
VGeneralLayer<Architecture_t>::VGeneralLayer(size_t batchSize, size_t inputDepth, size_t inputHeight, size_t inputWidth,
                                             size_t depth, size_t height, size_t width, size_t weightsNSlices,
                                             size_t weightsNRows, size_t weightsNCols, size_t biasesNSlices,
                                             size_t biasesNRows, size_t biasesNCols, size_t outputNSlices,
                                             size_t outputNRows, size_t outputNCols, EInitialization init)
   : fBatchSize(batchSize), fInputDepth(inputDepth), fInputHeight(inputHeight), fInputWidth(inputWidth), fDepth(depth),
     fHeight(height), fWidth(width), fIsTraining(true), fWeights(), fBiases(), fWeightGradients(), fBiasGradients(),
     fOutput( outputNSlices, outputNRows, outputNCols ),
     fActivationGradients( outputNSlices, outputNRows, outputNCols ),
     fInit(init)
{
   for (size_t i = 0; i < weightsNSlices; i++) {
      fWeights.emplace_back(weightsNRows, weightsNCols);
      fWeightGradients.emplace_back(weightsNRows, weightsNCols);
   }

   for (size_t i = 0; i < biasesNSlices; i++) {
      fBiases.emplace_back(biasesNRows, biasesNCols);
      fBiasGradients.emplace_back(biasesNRows, biasesNCols);
   }
}
template <typename Architecture_t>
VGeneralLayer<Architecture_t>::VGeneralLayer(size_t batchSize, size_t inputDepth, size_t inputHeight, size_t inputWidth,
                                             size_t depth, size_t height, size_t width, size_t weightsNSlices,
                                             std::vector<size_t> weightsNRows, std::vector<size_t> weightsNCols,
                                             size_t biasesNSlices, std::vector<size_t> biasesNRows,
                                             std::vector<size_t> biasesNCols, size_t outputNSlices, size_t outputNRows,
                                             size_t outputNCols, EInitialization init)
   : fBatchSize(batchSize), fInputDepth(inputDepth), fInputHeight(inputHeight), fInputWidth(inputWidth), fDepth(depth),
     fHeight(height), fWidth(width), fIsTraining(true), fWeights(), fBiases(), fWeightGradients(), fBiasGradients(),
     fOutput( outputNSlices, outputNRows, outputNCols ),
     fActivationGradients( outputNSlices, outputNRows, outputNCols ),
     fInit(init)
{
   for (size_t i = 0; i < weightsNSlices; i++) {
      fWeights.emplace_back(weightsNRows[i], weightsNCols[i]);
      fWeightGradients.emplace_back(weightsNRows[i], weightsNCols[i]);
   }

   for (size_t i = 0; i < biasesNSlices; i++) {
      fBiases.emplace_back(biasesNRows[i], biasesNCols[i]);
      fBiasGradients.emplace_back(biasesNRows[i], biasesNCols[i]);
   }
}
template <typename Architecture_t>
VGeneralLayer<Architecture_t>::VGeneralLayer(VGeneralLayer<Architecture_t> *layer)
   : fBatchSize(layer->GetBatchSize()), fInputDepth(layer->GetInputDepth()), fInputHeight(layer->GetInputHeight()),
     fInputWidth(layer->GetInputWidth()), fDepth(layer->GetDepth()), fHeight(layer->GetHeight()),
     fWidth(layer->GetWidth()), fIsTraining(layer->IsTraining()), fWeights(), fBiases(), fWeightGradients(),
     fBiasGradients(),
     fOutput( layer->GetOutput().GetShape() ),
     fActivationGradients( layer->GetActivationGradients().GetShape() ),
     fInit(layer->GetInitialization() )
{
   size_t weightsNSlices = (layer->GetWeights()).size();
   size_t weightsNRows = 0;
   size_t weightsNCols = 0;

   for (size_t i = 0; i < weightsNSlices; i++) {
      weightsNRows = (layer->GetWeightsAt(i)).GetNrows();
      weightsNCols = (layer->GetWeightsAt(i)).GetNcols();

      fWeights.emplace_back(weightsNRows, weightsNCols);
      fWeightGradients.emplace_back(weightsNRows, weightsNCols);

      Architecture_t::Copy(fWeights[i], layer->GetWeightsAt(i));
   }

   size_t biasesNSlices = (layer->GetBiases()).size();
   size_t biasesNRows = 0;
   size_t biasesNCols = 0;

   for (size_t i = 0; i < biasesNSlices; i++) {
      biasesNRows = (layer->GetBiasesAt(i)).GetNrows();
      biasesNCols = (layer->GetBiasesAt(i)).GetNcols();

      fBiases.emplace_back(biasesNRows, biasesNCols);
      fBiasGradients.emplace_back(biasesNRows, biasesNCols);

      Architecture_t::Copy(fBiases[i], layer->GetBiasesAt(i));
   }
}
template <typename Architecture_t>
VGeneralLayer<Architecture_t>::VGeneralLayer(const VGeneralLayer &layer)
   : fBatchSize(layer.fBatchSize), fInputDepth(layer.fInputDepth), fInputHeight(layer.fInputHeight),
     fInputWidth(layer.fInputWidth), fDepth(layer.fDepth), fHeight(layer.fHeight), fWidth(layer.fWidth),
     fIsTraining(layer.fIsTraining), fWeights(), fBiases(), fWeightGradients(), fBiasGradients(),
     fOutput( layer.GetOutput() ),
     fActivationGradients( layer.GetActivationGradients() ),
     fInit( layer.GetInitialization())
{
   size_t weightsNSlices = layer.fWeights.size();
   size_t weightsNRows = 0;
   size_t weightsNCols = 0;

   for (size_t i = 0; i < weightsNSlices; i++) {
      weightsNRows = (layer.fWeights[i]).GetNrows();
      weightsNCols = (layer.fWeights[i]).GetNcols();

      fWeights.emplace_back(weightsNRows, weightsNCols);
      fWeightGradients.emplace_back(weightsNRows, weightsNCols);

      Architecture_t::Copy(fWeights[i], layer.fWeights[i]);
   }

   size_t biasesNSlices = layer.fBiases.size();
   size_t biasesNRows = 0;
   size_t biasesNCols = 0;

   for (size_t i = 0; i < biasesNSlices; i++) {
      biasesNRows = (layer.fBiases[i]).GetNrows();
      biasesNCols = (layer.fBiases[i]).GetNcols();

      fBiases.emplace_back(biasesNRows, biasesNCols);
      fBiasGradients.emplace_back(biasesNRows, biasesNCols);

      Architecture_t::Copy(fBiases[i], layer.fBiases[i]);
   }

   size_t outputNSlices = layer.fOutput.size();
   size_t outputNRows = 0;
   size_t outputNCols = 0;

   for (size_t i = 0; i < outputNSlices; i++) {
      outputNRows = (layer.fOutput[i]).GetNrows();
      outputNCols = (layer.fOutput[i]).GetNcols();

      fOutput.emplace_back(outputNRows, outputNCols);
      fActivationGradients.emplace_back(outputNRows, outputNCols);
   }
}
template <typename Architecture_t>
VGeneralLayer<Architecture_t>::~VGeneralLayer()
{
   // Nothing to do here.
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::Initialize() -> void
{
   for (size_t i = 0; i < fWeights.size(); i++) {
      initialize<Architecture_t>(fWeights[i], this->GetInitialization());
      initialize<Architecture_t>(fWeightGradients[i], EInitialization::kZero);
   }

   for (size_t i = 0; i < fBiases.size(); i++) {
      initialize<Architecture_t>(fBiases[i], EInitialization::kZero);
      initialize<Architecture_t>(fBiasGradients[i], EInitialization::kZero);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::Update(const Scalar_t learningRate) -> void
{
   this->UpdateWeights(fWeightGradients, learningRate);
   this->UpdateBiases(fBiasGradients, learningRate);
}
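// The updates below are plain gradient-descent steps delegated to the backend: for each matrix,
// Architecture_t::ScaleAdd(A, B, -learningRate) performs A += (-learningRate) * B. For example,
// a weight entry of 0.50 with gradient 0.20 and learning rate 0.1 becomes 0.50 - 0.02 = 0.48.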
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::UpdateWeights(const std::vector<Matrix_t> &weightGradients,
                                                  const Scalar_t learningRate) -> void
{
   for (size_t i = 0; i < fWeights.size(); i++) {
      Architecture_t::ScaleAdd(fWeights[i], weightGradients[i], -learningRate);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::UpdateBiases(const std::vector<Matrix_t> &biasGradients,
                                                 const Scalar_t learningRate) -> void
{
   for (size_t i = 0; i < fBiases.size(); i++) {
      Architecture_t::ScaleAdd(fBiases[i], biasGradients[i], -learningRate);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::UpdateWeightGradients(const std::vector<Matrix_t> &weightGradients,
                                                          const Scalar_t learningRate) -> void
{
   for (size_t i = 0; i < fWeightGradients.size(); i++) {
      Architecture_t::ScaleAdd(fWeightGradients[i], weightGradients[i], -learningRate);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::UpdateBiasGradients(const std::vector<Matrix_t> &biasGradients,
                                                        const Scalar_t learningRate) -> void
{
   for (size_t i = 0; i < fBiasGradients.size(); i++) {
      Architecture_t::ScaleAdd(fBiasGradients[i], biasGradients[i], -learningRate);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::CopyWeights(const std::vector<Matrix_t> &otherWeights) -> void
{
   for (size_t i = 0; i < fWeights.size(); i++) {
      Architecture_t::Copy(fWeights[i], otherWeights[i]);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::CopyBiases(const std::vector<Matrix_t> &otherBiases) -> void
{
   for (size_t i = 0; i < fBiases.size(); i++) {
      Architecture_t::Copy(fBiases[i], otherBiases[i]);
   }
}
template <typename Architecture_t>
template <typename Arch>
void VGeneralLayer<Architecture_t>::CopyParameters(const VGeneralLayer<Arch> &layer)
{
   // copy weights and biases from a layer that may use a different architecture
   Architecture_t::CopyDiffArch(this->GetWeights(), layer.GetWeights());
   Architecture_t::CopyDiffArch(this->GetBiases(), layer.GetBiases());

   // copy also the additional layer parameters, if any
   auto params = layer.GetExtraLayerParameters();
   if (params.size() > 0) {
      auto paramsToCopy = GetExtraLayerParameters();
      Architecture_t::CopyDiffArch(paramsToCopy, params );
      SetExtraLayerParameters(paramsToCopy);
   }
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::WriteTensorToXML(void *node, const char *name, const std::vector<Matrix_t> &tensor) -> void
{
   auto xmlengine = gTools().xmlengine();
   void* matnode = xmlengine.NewChild(node, 0, name);
   if (tensor.size() == 0) return;
   xmlengine.NewAttr(matnode,0,"Depth", gTools().StringFromInt(tensor.size()) );
   // all matrices in the tensor are assumed to have the same shape
   xmlengine.NewAttr(matnode,0,"Rows", gTools().StringFromInt(tensor[0].GetNrows()) );
   xmlengine.NewAttr(matnode,0,"Columns", gTools().StringFromInt(tensor[0].GetNcols()) );
   std::stringstream s;
   for (size_t i = 0; i < tensor.size(); ++i) {
      auto & mat = tensor[i];
      for (Int_t row = 0; row < mat.GetNrows(); row++) {
         for (Int_t col = 0; col < mat.GetNcols(); col++) {
            TString tmp = TString::Format( "%5.15e ", (mat)(row,col) );
            s << tmp.Data();
         }
      }
   }
   xmlengine.AddRawLine( matnode, s.str().c_str() );
}
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::WriteMatrixToXML(void *node, const char *name, const Matrix_t &matrix) -> void
{
   auto xmlengine = gTools().xmlengine();
   void* matnode = xmlengine.NewChild(node, 0, name);

   xmlengine.NewAttr(matnode,0,"Rows", gTools().StringFromInt(matrix.GetNrows()) );
   xmlengine.NewAttr(matnode,0,"Columns", gTools().StringFromInt(matrix.GetNcols()) );

   std::stringstream s;
   s.precision( std::numeric_limits<Scalar_t>::digits10 );
   size_t nrows = matrix.GetNrows();
   size_t ncols = matrix.GetNcols();
   for (size_t row = 0; row < nrows; row++) {
      for (size_t col = 0; col < ncols; col++) {
         s << std::scientific << matrix(row,col) << " ";
      }
   }

   xmlengine.AddRawLine( matnode, s.str().c_str() );
}
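// ReadMatrixXML is the inverse of WriteMatrixToXML: it parses the whitespace-separated numbers
// from the node content into a temporary TMatrixT and then copies it into the architecture's
// matrix type (the GPU build reads each value into a temporary scalar before assigning it).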
template <typename Architecture_t>
auto VGeneralLayer<Architecture_t>::ReadMatrixXML(void *node, const char *name, Matrix_t &matrix) -> void
{
   void *matrixXML = gTools().GetChild(node, name);
   size_t rows, cols;
   gTools().ReadAttr(matrixXML, "Rows", rows);
   gTools().ReadAttr(matrixXML, "Columns", cols);

   R__ASSERT((size_t) matrix.GetNrows() == rows);
   R__ASSERT((size_t) matrix.GetNcols() == cols);

   TMatrixT<Scalar_t> tmatrix(rows, cols);

   const char * matrixString = gTools().xmlengine().GetNodeContent(matrixXML);
   std::stringstream matrixStringStream(matrixString);

   for (size_t i = 0; i < rows; i++)
   {
      for (size_t j = 0; j < cols; j++)
      {
#ifndef R__HAS_TMVAGPU
         matrixStringStream >> tmatrix(i,j);
#else
         Scalar_t value;
         matrixStringStream >> value;
         tmatrix(i,j) = value;
#endif
      }
   }

   Matrix_t tmp( tmatrix);
   Architecture_t::Copy(matrix, tmp);
}
template <typename Architecture>
auto debugTensor(const typename Architecture::Tensor_t &A, const std::string name = "tensor") -> void
{
   Architecture::PrintTensor(A, name);
}

} // namespace DNN
} // namespace TMVA

#endif