NeuralNet.icc
1 #ifndef TMVA_NEURAL_NET_I
2 #define TMVA_NEURAL_NET_I
3 
4 #ifndef TMVA_NEURAL_NET
5 #error "Do not use NeuralNet.icc directly. #include \"NeuralNet.h\" instead."
6 #endif // TMVA_NEURAL_NET
7 #pragma once
8 #ifndef _MSC_VER
9 #pragma GCC diagnostic ignored "-Wunused-variable"
10 #endif
11 
12 #include "Math/Util.h"
13 
14 #include "TMVA/Pattern.h"
15 #include "TMVA/MethodBase.h"
16 
17 #include <tuple>
18 #include <future>
19 #include <random>
20 
21 namespace TMVA
22 {
23  namespace DNN
24  {
25 
26 
27 
28 
29 
30 
31 
32 
33  template <typename T>
34  T uniformFromTo (T from, T to)
35  {
36  return from + (rand ()* (to - from)/RAND_MAX);
37  }
38 
39 
40 
41  template <typename Container, typename T>
42  void uniformDouble (Container& container, T maxValue)
43  {
44  for (auto it = begin (container), itEnd = end (container); it != itEnd; ++it)
45  {
46 // (*it) = uniformFromTo (-1.0*maxValue, 1.0*maxValue);
47  (*it) = TMVA::DNN::uniformFromTo (-1.0*maxValue, 1.0*maxValue);
48  }
49  }
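// Illustrative sketch (not part of the original file): fill a weight container with
// uniform random values in [-0.5, 0.5] using the helper above.
//
//   std::vector<double> weights (10);
//   TMVA::DNN::uniformDouble (weights, 0.5); // every entry ends up in [-0.5, 0.5]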
50 
51 
52  extern std::shared_ptr<std::function<double(double)>> ZeroFnc;
53 
54 
55  extern std::shared_ptr<std::function<double(double)>> Sigmoid;
56  extern std::shared_ptr<std::function<double(double)>> InvSigmoid;
57 
58  extern std::shared_ptr<std::function<double(double)>> Tanh;
59  extern std::shared_ptr<std::function<double(double)>> InvTanh;
60 
61  extern std::shared_ptr<std::function<double(double)>> Linear;
62  extern std::shared_ptr<std::function<double(double)>> InvLinear;
63 
64  extern std::shared_ptr<std::function<double(double)>> SymmReLU;
65  extern std::shared_ptr<std::function<double(double)>> InvSymmReLU;
66 
67  extern std::shared_ptr<std::function<double(double)>> ReLU;
68  extern std::shared_ptr<std::function<double(double)>> InvReLU;
69 
70  extern std::shared_ptr<std::function<double(double)>> SoftPlus;
71  extern std::shared_ptr<std::function<double(double)>> InvSoftPlus;
72 
73  extern std::shared_ptr<std::function<double(double)>> TanhShift;
74  extern std::shared_ptr<std::function<double(double)>> InvTanhShift;
75 
76  extern std::shared_ptr<std::function<double(double)>> SoftSign;
77  extern std::shared_ptr<std::function<double(double)>> InvSoftSign;
78 
79  extern std::shared_ptr<std::function<double(double)>> Gauss;
80  extern std::shared_ptr<std::function<double(double)>> InvGauss;
81 
82  extern std::shared_ptr<std::function<double(double)>> GaussComplement;
83  extern std::shared_ptr<std::function<double(double)>> InvGaussComplement;
84 
85 
86 /*! \brief apply weights using drop-out; for no drop-out, pass a pointer to a bool set to true as itDrop, so that *itDrop evaluates to true
87  *
88  * itDrop correlates with itSourceBegin
89  */
90 template <bool HasDropOut, typename ItSource, typename ItWeight, typename ItTarget, typename ItDrop>
91  void applyWeights (ItSource itSourceBegin, ItSource itSourceEnd,
92  ItWeight itWeight,
93  ItTarget itTargetBegin, ItTarget itTargetEnd,
94  ItDrop itDrop)
95  {
96  for (auto itSource = itSourceBegin; itSource != itSourceEnd; ++itSource)
97  {
98  for (auto itTarget = itTargetBegin; itTarget != itTargetEnd; ++itTarget)
99  {
100  if (!HasDropOut || *itDrop)
101  (*itTarget) += (*itSource) * (*itWeight);
102  ++itWeight;
103  }
104  if (HasDropOut) ++itDrop;
105  }
106  }
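// Illustrative sketch (assumed values, not part of the original file): forward-propagate a
// 2-node source layer into a 2-node target layer without drop-out. The weights are laid out
// source-major, i.e. all outgoing weights of a source node are consecutive.
//
//   std::vector<double> source  = {1.0, 2.0};
//   std::vector<double> weights = {0.1, 0.2, 0.3, 0.4};
//   std::vector<double> target  (2, 0.0);
//   bool noDrop = true;
//   applyWeights<false> (source.begin (), source.end (), weights.begin (),
//                        target.begin (), target.end (), &noDrop);
//   // target becomes {1.0*0.1 + 2.0*0.3, 1.0*0.2 + 2.0*0.4} == {0.7, 1.0}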
107 
108 
109 
110 
111 
112 
113 /*! \brief apply weights backwards (for backprop); for no drop-out, pass a pointer to a bool set to true as itDrop, so that *itDrop evaluates to true
114  *
115  * itDrop correlates with itPrev (in agreement with "applyWeights", where it correlates with itSource; that is the same node as itPrev here in applyWeightsBackwards)
116  */
117 template <bool HasDropOut, typename ItSource, typename ItWeight, typename ItPrev, typename ItDrop>
118  void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd,
119  ItWeight itWeight,
120  ItPrev itPrevBegin, ItPrev itPrevEnd,
121  ItDrop itDrop)
122  {
123  for (auto itPrev = itPrevBegin; itPrev != itPrevEnd; ++itPrev)
124  {
125  for (auto itCurr = itCurrBegin; itCurr != itCurrEnd; ++itCurr)
126  {
127  if (!HasDropOut || *itDrop)
128  (*itPrev) += (*itCurr) * (*itWeight);
129  ++itWeight;
130  }
131  if (HasDropOut) ++itDrop;
132  }
133  }
134 
135 
136 
137 
138 
139 
140 
141 /*! \brief apply the activation functions
142  *
143  *
144  */
145 
146  template <typename ItValue, typename Fnc>
147  void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc)
148  {
149  while (itValue != itValueEnd)
150  {
151  auto& value = (*itValue);
152  value = (*fnc.get ()) (value);
153 
154  ++itValue;
155  }
156  }
157 
158 
159 /*! \brief apply the activation functions and compute the gradient
160  *
161  *
162  */
163  template <typename ItValue, typename Fnc, typename InvFnc, typename ItGradient>
164  void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc, InvFnc invFnc, ItGradient itGradient)
165  {
166  while (itValue != itValueEnd)
167  {
168  auto& value = (*itValue);
169  value = (*fnc.get ()) (value);
170  (*itGradient) = (*invFnc.get ()) (value);
171 
172  ++itValue; ++itGradient;
173  }
174  }
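// Illustrative sketch (not part of the original file): apply the Sigmoid activation in place
// and store the corresponding gradients, using the function objects declared above.
//
//   std::vector<double> values    = {-1.0, 0.0, 1.0};
//   std::vector<double> gradients (values.size (), 0.0);
//   applyFunctions (values.begin (), values.end (), Sigmoid, InvSigmoid, gradients.begin ());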
175 
176 
177 
178 /*! \brief update the gradients
179  *
180  *
181  */
182  template <typename ItSource, typename ItDelta, typename ItTargetGradient, typename ItGradient>
183  void update (ItSource itSource, ItSource itSourceEnd,
184  ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd,
185  ItTargetGradient itTargetGradientBegin,
186  ItGradient itGradient)
187  {
188  while (itSource != itSourceEnd)
189  {
190  auto itTargetDelta = itTargetDeltaBegin;
191  auto itTargetGradient = itTargetGradientBegin;
192  while (itTargetDelta != itTargetDeltaEnd)
193  {
194  (*itGradient) -= (*itTargetDelta) * (*itSource) * (*itTargetGradient);
195  ++itTargetDelta; ++itTargetGradient; ++itGradient;
196  }
197  ++itSource;
198  }
199  }
200 
201 
202 
203 
204 /*! \brief compute the regularization (L1, L2)
205  *
206  *
207  */
208  template <EnumRegularization Regularization>
209  inline double computeRegularization (double weight, const double& factorWeightDecay)
210  {
211  MATH_UNUSED(weight);
212  MATH_UNUSED(factorWeightDecay);
213 
214  return 0;
215  }
216 
217 // L1 regularization
218  template <>
219  inline double computeRegularization<EnumRegularization::L1> (double weight, const double& factorWeightDecay)
220  {
221  return weight == 0.0 ? 0.0 : std::copysign (factorWeightDecay, weight);
222  }
223 
224 // L2 regularization
225  template <>
226  inline double computeRegularization<EnumRegularization::L2> (double weight, const double& factorWeightDecay)
227  {
228  return factorWeightDecay * weight;
229  }
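// Illustrative sketch (assumed values): the contribution of a single weight to the gradient
// for the two regularization modes implemented above.
//
//   double w = -0.4, decay = 0.01;
//   computeRegularization<EnumRegularization::L1> (w, decay); // == copysign (0.01, -0.4) == -0.01
//   computeRegularization<EnumRegularization::L2> (w, decay); // == 0.01 * (-0.4)         == -0.004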
230 
231 
232 /*! \brief update the gradients, using regularization
233  *
234  *
235  */
236  template <EnumRegularization Regularization, typename ItSource, typename ItDelta, typename ItTargetGradient, typename ItGradient, typename ItWeight>
237  void update (ItSource itSource, ItSource itSourceEnd,
238  ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd,
239  ItTargetGradient itTargetGradientBegin,
240  ItGradient itGradient,
241  ItWeight itWeight, double weightDecay)
242  {
243  // ! the factor weightDecay must already be scaled by 1/n, where n is the number of weights
244  while (itSource != itSourceEnd)
245  {
246  auto itTargetDelta = itTargetDeltaBegin;
247  auto itTargetGradient = itTargetGradientBegin;
248  while (itTargetDelta != itTargetDeltaEnd)
249  {
250  (*itGradient) -= (*itTargetDelta) * (*itSource) * (*itTargetGradient) + computeRegularization<Regularization>(*itWeight,weightDecay);
251  ++itTargetDelta; ++itTargetGradient; ++itGradient; ++itWeight;
252  }
253  ++itSource;
254  }
255  }
256 
257 
258 
259 
260 
261 
262 #define USELOCALWEIGHTS 1
263 
264 
265 
266 /*! \brief implementation of the steepest gradient descent algorithm
267  *
268  * Can be used with multithreading (i.e. "HogWild!" style); see call in trainCycle
269  */
270  template <typename Function, typename Weights, typename PassThrough>
271  double Steepest::operator() (Function& fitnessFunction, Weights& weights, PassThrough& passThrough)
272  {
273  size_t numWeights = weights.size ();
274  // std::vector<double> gradients (numWeights, 0.0);
275  m_localGradients.assign (numWeights, 0.0);
276  // std::vector<double> localWeights (begin (weights), end (weights));
277  // m_localWeights.reserve (numWeights);
278  m_localWeights.assign (begin (weights), end (weights));
279 
280  double E = 1e10;
281  if (m_prevGradients.size () != numWeights)
282  {
283  m_prevGradients.clear ();
284  m_prevGradients.assign (weights.size (), 0);
285  }
286 
287  bool success = true;
288  size_t currentRepetition = 0;
289  while (success)
290  {
291  if (currentRepetition >= m_repetitions)
292  break;
293 
294  m_localGradients.assign (numWeights, 0.0);
295 
296  // --- nesterov momentum ---
297  // apply momentum before computing the new gradient
298  auto itPrevG = begin (m_prevGradients);
299  auto itPrevGEnd = end (m_prevGradients);
300  auto itLocWeight = begin (m_localWeights);
301  for (; itPrevG != itPrevGEnd; ++itPrevG, ++itLocWeight)
302  {
303  (*itPrevG) *= m_beta;
304  (*itLocWeight) += (*itPrevG);
305  }
306 
307  E = fitnessFunction (passThrough, m_localWeights, m_localGradients);
308 // plotGradients (gradients);
309 // plotWeights (localWeights);
310 
311  double alpha = gaussDouble (m_alpha, m_alpha/2.0);
312 // double alpha = m_alpha;
313 
314  auto itG = begin (m_localGradients);
315  auto itGEnd = end (m_localGradients);
316  itPrevG = begin (m_prevGradients);
317  double maxGrad = 0.0;
318  for (; itG != itGEnd; ++itG, ++itPrevG)
319  {
320  double currGrad = (*itG);
321  double prevGrad = (*itPrevG);
322  currGrad *= alpha;
323 
324  //(*itPrevG) = m_beta * (prevGrad + currGrad);
325  currGrad += prevGrad;
326  (*itG) = currGrad;
327  (*itPrevG) = currGrad;
328 
329  if (std::fabs (currGrad) > maxGrad)
330  maxGrad = currGrad;
331  }
332 
333  if (maxGrad > 1)
334  {
335  m_alpha /= 2;
336  std::cout << "\nlearning rate reduced to " << m_alpha << std::endl;
337  std::for_each (weights.begin (), weights.end (), [maxGrad](double& w)
338  {
339  w /= maxGrad;
340  });
341  m_prevGradients.clear ();
342  }
343  else
344  {
345  auto itW = std::begin (weights);
346  std::for_each (std::begin (m_localGradients), std::end (m_localGradients), [&itW](double& g)
347  {
348  *itW += g;
349  ++itW;
350  });
351  }
352 
353  ++currentRepetition;
354  }
355  return E;
356  }
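// Illustrative sketch (all names assumed to be set up elsewhere): the minimizer is invoked with
// the net as the fitness function, mirroring the call in Net::trainCycle further below.
//
//   pass_through_type settingsAndBatch (settings, batch, dropContainer);
//   double error = minimizer (net, weights, settingsAndBatch);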
357 
358 
359 
360 
361 
362 
363 
364 
365 
366 
367 
368 
369 
370 
371 
372 
373 
374 
375 
376 
377 /*! \brief sum of squares error function
378  *
379  *
380  */
381  template <typename ItOutput, typename ItTruth, typename ItDelta, typename InvFnc>
382  double sumOfSquares (ItOutput itOutputBegin, ItOutput itOutputEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, InvFnc invFnc, double patternWeight)
383  {
384  double errorSum = 0.0;
385 
386  // output - truth
387  ItTruth itTruth = itTruthBegin;
388  bool hasDeltas = (itDelta != itDeltaEnd);
389  for (ItOutput itOutput = itOutputBegin; itOutput != itOutputEnd; ++itOutput, ++itTruth)
390  {
391 // assert (itTruth != itTruthEnd);
392  double output = (*itOutput);
393  double error = output - (*itTruth);
394  if (hasDeltas)
395  {
396  (*itDelta) = (*invFnc.get ()) (output) * error * patternWeight;
397  ++itDelta;
398  }
399  errorSum += error*error * patternWeight;
400  }
401 
402  return 0.5*errorSum;
403  }
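// Illustrative sketch (assumed values): passing an empty delta range computes only the error;
// the inverse activation function is then not evaluated.
//
//   std::vector<double> out = {0.8, 0.2}, truth = {1.0, 0.0}, deltas;
//   double e = sumOfSquares (out.begin (), out.end (), truth.begin (), truth.end (),
//                            deltas.begin (), deltas.end (), InvLinear, 1.0);
//   // e == 0.5 * ((0.8-1.0)^2 + (0.2-0.0)^2) == 0.04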
404 
405 
406 
407 /*! \brief cross entropy error function
408  *
409  *
410  */
411  template <typename ItProbability, typename ItTruth, typename ItDelta, typename ItInvActFnc>
412  double crossEntropy (ItProbability itProbabilityBegin, ItProbability itProbabilityEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight)
413  {
414  bool hasDeltas = (itDelta != itDeltaEnd);
415 
416  double errorSum = 0.0;
417  for (ItProbability itProbability = itProbabilityBegin; itProbability != itProbabilityEnd; ++itProbability)
418  {
419  double probability = *itProbability;
420  double truth = *itTruthBegin;
421  /* truth = truth < 0.1 ? 0.1 : truth; */
422  /* truth = truth > 0.9 ? 0.9 : truth; */
423  truth = truth < 0.5 ? 0.1 : 0.9;
424  if (hasDeltas)
425  {
426  double delta = probability - truth;
427  (*itDelta) = delta*patternWeight;
428 // (*itDelta) = (*itInvActFnc)(probability) * delta * patternWeight;
429  ++itDelta;
430  }
431  double error (0);
432  if (probability == 0) // protection against log (0)
433  {
434  if (truth >= 0.5)
435  error += 1.0;
436  }
437  else if (probability == 1)
438  {
439  if (truth < 0.5)
440  error += 1.0;
441  }
442  else
443  error += - (truth * log (probability) + (1.0-truth) * log (1.0-probability)); // cross entropy function
444  errorSum += error * patternWeight;
445 
446  }
447  return errorSum;
448  }
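// Numeric sketch (assumed values): a single output with probability 0.7 and truth 1 (mapped to
// 0.9 above) contributes -(0.9*log(0.7) + 0.1*log(0.3)), i.e. about 0.44, to the error sum for
// unit pattern weight.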
449 
450 
451 
452 
453 /*! \brief soft-max cross-entropy error function (for mutually exclusive classes)
454  *
455  *
456  */
457  template <typename ItOutput, typename ItTruth, typename ItDelta, typename ItInvActFnc>
458  double softMaxCrossEntropy (ItOutput itProbabilityBegin, ItOutput itProbabilityEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight)
459  {
460  double errorSum = 0.0;
461 
462  bool hasDeltas = (itDelta != itDeltaEnd);
463  // output - truth
464  ItTruth itTruth = itTruthBegin;
465  for (auto itProbability = itProbabilityBegin; itProbability != itProbabilityEnd; ++itProbability, ++itTruth)
466  {
467 // assert (itTruth != itTruthEnd);
468  double probability = (*itProbability);
469  double truth = (*itTruth);
470  if (hasDeltas)
471  {
472  (*itDelta) = probability - truth;
473 // (*itDelta) = (*itInvActFnc)(sm) * delta * patternWeight;
474  ++itDelta; //++itInvActFnc;
475  }
476  double error (0);
477 
478  error += truth * log (probability);
479  errorSum += error;
480  }
481 
482  return -errorSum * patternWeight;
483  }
484 
485 
486 
487 
488 
489 
490 
491 
492 
493 /*! \brief compute the weight decay for regularization (L1 or L2)
494  *
495  *
496  */
497  template <typename ItWeight>
498  double weightDecay (double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
499  {
500  if (eRegularization == EnumRegularization::L1)
501  {
502  // weight decay (regularization)
503  double w = 0;
504  size_t n = 0;
505  for (; itWeight != itWeightEnd; ++itWeight, ++n)
506  {
507  double weight = (*itWeight);
508  w += std::fabs (weight);
509  }
510  return error + 0.5 * w * factorWeightDecay / n;
511  }
512  else if (eRegularization == EnumRegularization::L2)
513  {
514  // weight decay (regularization)
515  double w = 0;
516  size_t n = 0;
517  for (; itWeight != itWeightEnd; ++itWeight, ++n)
518  {
519  double weight = (*itWeight);
520  w += weight*weight;
521  }
522  return error + 0.5 * w * factorWeightDecay / n;
523  }
524  else
525  return error;
526  }
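// Numeric sketch (assumed values): with two weights {0.5, -0.5}, an L2 factor of 0.1 and an
// incoming error of 1.0, the function returns 1.0 + 0.5 * (0.25 + 0.25) * 0.1 / 2 == 1.0125.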
527 
528 
529 
530 
531 
532 
533 
534 
535 
536 
537 
538 
539 
540 
541 /*! \brief apply the weights (and functions) in forward direction of the DNN
542  *
543  *
544  */
545  template <typename LAYERDATA>
546  void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData)
547  {
548  if (prevLayerData.hasDropOut ())
549  {
550  applyWeights<true> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
551  currLayerData.weightsBegin (),
552  currLayerData.valuesBegin (), currLayerData.valuesEnd (),
553  prevLayerData.dropOut ());
554  }
555  else
556  {
557  bool dummy = true;
558  applyWeights<false> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
559  currLayerData.weightsBegin (),
560  currLayerData.valuesBegin (), currLayerData.valuesEnd (),
561  &dummy); // dummy to turn on all nodes (no drop out)
562  }
563  }
564 
565 
566 
567 /*! \brief backward application of the weights (back-propagation of the error)
568  *
569  *
570  */
571 template <typename LAYERDATA>
572  void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData)
573 {
574  if (prevLayerData.hasDropOut ())
575  {
576  applyWeightsBackwards<true> (currLayerData.deltasBegin (), currLayerData.deltasEnd (),
577  currLayerData.weightsBegin (),
578  prevLayerData.deltasBegin (), prevLayerData.deltasEnd (),
579  prevLayerData.dropOut ());
580  }
581  else
582  {
583  bool dummy = true;
584  applyWeightsBackwards<false> (currLayerData.deltasBegin (), currLayerData.deltasEnd (),
585  currLayerData.weightsBegin (),
586  prevLayerData.deltasBegin (), prevLayerData.deltasEnd (),
587  &dummy); // dummy to use all nodes (no drop out)
588  }
589 }
590 
591 
592 
593 
594 
595 /*! \brief update the gradients of the layer connections (with optional L1/L2 regularization)
596  *
597  *
598  */
599  template <typename LAYERDATA>
600  void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double factorWeightDecay, EnumRegularization regularization)
601  {
602  // ! the "factorWeightDecay" must already be scaled by 1/n, where n is the number of weights
603  if (factorWeightDecay != 0.0) // has weight regularization
604  if (regularization == EnumRegularization::L1) // L1 regularization ( sum(|w|) )
605  {
606  update<EnumRegularization::L1> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
607  currLayerData.deltasBegin (), currLayerData.deltasEnd (),
608  currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (),
609  currLayerData.weightsBegin (), factorWeightDecay);
610  }
611  else if (regularization == EnumRegularization::L2) // L2 regularization ( sum(w^2) )
612  {
613  update<EnumRegularization::L2> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
614  currLayerData.deltasBegin (), currLayerData.deltasEnd (),
615  currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (),
616  currLayerData.weightsBegin (), factorWeightDecay);
617  }
618  else
619  {
620  update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
621  currLayerData.deltasBegin (), currLayerData.deltasEnd (),
622  currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ());
623  }
624 
625  else
626  { // no weight regularization
627  update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
628  currLayerData.deltasBegin (), currLayerData.deltasEnd (),
629  currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ());
630  }
631  }
632 
633 
634 
635 
636 
637 
638 
639 
640 
641 
642 
643 
644 /*! \brief compute the drop-out-weight factor
645  *
646  * when using drop-out, a fraction of the nodes is turned off at each cycle of the computation;
647  * once all nodes are turned on again (for instance when the test samples are evaluated),
648  * the weights have to be adjusted to account for the different number of active nodes;
649  * this function computes that factor and applies it to the weights
650  */
651  template <typename WeightsType, typename DropProbabilities>
652  void Net::dropOutWeightFactor (WeightsType& weights,
653  const DropProbabilities& drops,
654  bool inverse)
655  {
656  if (drops.empty () || weights.empty ())
657  return;
658 
659  auto itWeight = std::begin (weights);
660  auto itWeightEnd = std::end (weights);
661  auto itDrop = std::begin (drops);
662  auto itDropEnd = std::end (drops);
663  size_t numNodesPrev = inputSize ();
664  double dropFractionPrev = *itDrop;
665  ++itDrop;
666 
667  for (auto& layer : layers ())
668  {
669  if (itDrop == itDropEnd)
670  break;
671 
672  size_t _numNodes = layer.numNodes ();
673 
674  double dropFraction = *itDrop;
675  double pPrev = 1.0 - dropFractionPrev;
676  double p = 1.0 - dropFraction;
677  p *= pPrev;
678 
679  if (inverse)
680  {
681  p = 1.0/p;
682  }
683  size_t _numWeights = layer.numWeights (numNodesPrev);
684  for (size_t iWeight = 0; iWeight < _numWeights; ++iWeight)
685  {
686  if (itWeight == itWeightEnd)
687  break;
688 
689  *itWeight *= p;
690  ++itWeight;
691  }
692  numNodesPrev = _numNodes;
693  dropFractionPrev = dropFraction;
694  ++itDrop;
695  }
696  }
697 
698 
699 
700 
701 
702 
703 /*! \brief execute the training until convergence emerges
704  *
705  * \param weights the container with the weights (synapses)
706  * \param trainPattern the pattern for the training
707  * \param testPattern the pattern for the testing
708  * \param minimizer the minimizer (e.g. steepest gradient descent) to be used
709  * \param settings the settings for the training (e.g. multithreading or not, regularization etc.)
710  */
711  template <typename Minimizer>
712  double Net::train (std::vector<double>& weights,
713  std::vector<Pattern>& trainPattern,
714  const std::vector<Pattern>& testPattern,
715  Minimizer& minimizer,
716  Settings& settings)
717  {
718 // std::cout << "START TRAINING" << std::endl;
719  settings.startTrainCycle ();
720 
721  // JsMVA progress bar maximum (100%)
722  if (fIPyMaxIter) *fIPyMaxIter = 100;
723 
724  settings.pads (4);
725  settings.create ("trainErrors", 100, 0, 100, 100, 0,1);
726  settings.create ("testErrors", 100, 0, 100, 100, 0,1);
727 
728  size_t cycleCount = 0;
729  size_t testCycleCount = 0;
730  double testError = 1e20;
731  double trainError = 1e20;
732  size_t dropOutChangeCount = 0;
733 
734  DropContainer dropContainer;
735  DropContainer dropContainerTest;
736  const std::vector<double>& dropFractions = settings.dropFractions ();
737  bool isWeightsForDrop = false;
738 
739 
740  // until convergence
741  do
742  {
743  ++cycleCount;
744 
745  // if dropOut enabled
746  size_t dropIndex = 0;
747  if (!dropFractions.empty () && dropOutChangeCount % settings.dropRepetitions () == 0)
748  {
749  // fill the dropOut-container
750  dropContainer.clear ();
751  size_t _numNodes = inputSize ();
752  double dropFraction = 0.0;
753  dropFraction = dropFractions.at (dropIndex);
754  ++dropIndex;
755  fillDropContainer (dropContainer, dropFraction, _numNodes);
756  for (auto itLayer = begin (m_layers), itLayerEnd = end (m_layers); itLayer != itLayerEnd; ++itLayer, ++dropIndex)
757  {
758  auto& layer = *itLayer;
759  _numNodes = layer.numNodes ();
760  // how many nodes have to be dropped
761  dropFraction = 0.0;
762  if (dropFractions.size () > dropIndex)
763  dropFraction = dropFractions.at (dropIndex);
764 
765  fillDropContainer (dropContainer, dropFraction, _numNodes);
766  }
767  isWeightsForDrop = true;
768  }
769 
770  // execute training cycle
771  trainError = trainCycle (minimizer, weights, begin (trainPattern), end (trainPattern), settings, dropContainer);
772 
773 
774  // ------ check if we have to execute a test ------------------
775  bool hasConverged = false;
776  if (testCycleCount % settings.testRepetitions () == 0) // we test only every "testRepetitions"-th repetition
777  {
778  if (isWeightsForDrop)
779  {
780  dropOutWeightFactor (weights, dropFractions);
781  isWeightsForDrop = false;
782  }
783 
784 
785  testError = 0;
786  //double weightSum = 0;
787  settings.startTestCycle ();
788  if (settings.useMultithreading ())
789  {
790  size_t numThreads = std::thread::hardware_concurrency ();
791  size_t patternPerThread = testPattern.size () / numThreads;
792  std::vector<Batch> batches;
793  auto itPat = testPattern.begin ();
794  // auto itPatEnd = testPattern.end ();
795  for (size_t idxThread = 0; idxThread < numThreads-1; ++idxThread)
796  {
797  batches.push_back (Batch (itPat, itPat + patternPerThread));
798  itPat += patternPerThread;
799  }
800  if (itPat != testPattern.end ())
801  batches.push_back (Batch (itPat, testPattern.end ()));
802 
803  std::vector<std::future<std::tuple<double,std::vector<double>>>> futures;
804  for (auto& batch : batches)
805  {
806  // -------------------- execute each of the batch ranges on a different thread -------------------------------
807  futures.push_back (
808  std::async (std::launch::async, [&]()
809  {
810  std::vector<double> localOutput;
811  pass_through_type passThrough (settings, batch, dropContainerTest);
812  double testBatchError = (*this) (passThrough, weights, ModeOutput::FETCH, localOutput);
813  return std::make_tuple (testBatchError, localOutput);
814  })
815  );
816  }
817 
818  auto itBatch = batches.begin ();
819  for (auto& f : futures)
820  {
821  std::tuple<double,std::vector<double>> result = f.get ();
822  testError += std::get<0>(result) / batches.size ();
823  std::vector<double> output = std::get<1>(result);
824  if (output.size() == (outputSize() - 1) * itBatch->size())
825  {
826  auto output_iterator = output.begin();
827  for (auto pattern_it = itBatch->begin(); pattern_it != itBatch->end(); ++pattern_it)
828  {
829  for (size_t output_index = 1; output_index < outputSize(); ++output_index)
830  {
831  settings.testSample (0, *output_iterator, (*pattern_it).output ().at (0),
832  (*pattern_it).weight ());
833  ++output_iterator;
834  }
835  }
836  }
837  ++itBatch;
838  }
839 
840  }
841  else
842  {
843  std::vector<double> output;
844  //for (auto it = begin (testPattern), itEnd = end (testPattern); it != itEnd; ++it)
845  {
846  //const Pattern& p = (*it);
847  //double weight = p.weight ();
848  //Batch batch (it, it+1);
849  Batch batch (begin (testPattern), end (testPattern));
850  output.clear ();
851  pass_through_type passThrough (settings, batch, dropContainerTest);
852  double testPatternError = (*this) (passThrough, weights, ModeOutput::FETCH, output);
853  if (output.size() == (outputSize() - 1) * batch.size())
854  {
855  auto output_iterator = output.begin();
856  for (auto pattern_it = batch.begin(); pattern_it != batch.end(); ++pattern_it)
857  {
858  for (size_t output_index = 1; output_index < outputSize(); ++output_index)
859  {
860  settings.testSample (0, *output_iterator, (*pattern_it).output ().at (0),
861  (*pattern_it).weight ());
862  ++output_iterator;
863  }
864  }
865  }
866  testError += testPatternError; /// batch.size ();
867  }
868  // testError /= testPattern.size ();
869  }
870  settings.endTestCycle ();
871 // testError /= weightSum;
872 
873  settings.computeResult (*this, weights);
874 
875  hasConverged = settings.hasConverged (testError);
876  if (!hasConverged && !isWeightsForDrop)
877  {
878  dropOutWeightFactor (weights, dropFractions, true); // inverse
879  isWeightsForDrop = true;
880  }
881  }
882  ++testCycleCount;
883  ++dropOutChangeCount;
884 
885 
886  static double x = -1.0;
887  x += 1.0;
888 // settings.resetPlot ("errors");
889  settings.addPoint ("trainErrors", cycleCount, trainError);
890  settings.addPoint ("testErrors", cycleCount, testError);
891  settings.plot ("trainErrors", "C", 1, kBlue);
892  settings.plot ("testErrors", "C", 1, kMagenta);
893 
894 
895  // setup error plots and progress bar variables for JsMVA
896  if (fInteractive){
897  fInteractive->AddPoint(cycleCount, trainError, testError);
898  if (*fExitFromTraining) break;
899  *fIPyCurrentIter = 100*(double)settings.maxConvergenceCount () /(double)settings.convergenceSteps ();
900  }
901 
902  if (hasConverged)
903  break;
904 
905  if ((int)cycleCount % 10 == 0) {
906 
907  TString convText = Form( "(train/test/epo/conv/maxco): %.3g/%.3g/%d/%d/%d",
908  trainError,
909  testError,
910  (int)cycleCount,
911  (int)settings.convergenceCount (),
912  (int)settings.maxConvergenceCount ());
913  double progress = 100*(double)settings.maxConvergenceCount () /(double)settings.convergenceSteps ();
914  settings.cycle (progress, convText);
915  }
916  }
917  while (true);
918  settings.endTrainCycle (trainError);
919 
920  TString convText = Form( "(train/test/epoch): %.4g/%.4g/%d", trainError, testError, (int)cycleCount);
921  double progress = 100*(double)settings.maxConvergenceCount() /(double)settings.convergenceSteps ();
922  settings.cycle (progress, convText);
923 
924  return testError;
925  }
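// Illustrative sketch (names assumed to be set up elsewhere, cf. the commented-out preTrain
// code at the end of this file): running the training loop with previously initialized weights.
//
//   double testError = net.train (weights, trainPattern, testPattern, minimizer, settings);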
926 
927 
928 
929 /*! \brief execute a single training cycle
930  *
931  * uses multithreading if turned on
932  *
933  * \param minimizer the minimizer to be used (e.g. SGD)
934  * \param weights the weight container with all the synapse weights
935  * \param itPatternBegin begin of the pattern container
936  * \param itPatternEnd the end of the pattern container
937  * \param settings the settings for this training (e.g. multithreading or not, regularization, etc.)
938  * \param dropContainer the data for dropping-out nodes (regularization technique)
939  */
940  template <typename Iterator, typename Minimizer>
941  inline double Net::trainCycle (Minimizer& minimizer, std::vector<double>& weights,
942  Iterator itPatternBegin, Iterator itPatternEnd, Settings& settings, DropContainer& dropContainer)
943  {
944  double error = 0.0;
945  size_t numPattern = std::distance (itPatternBegin, itPatternEnd);
946  size_t numBatches = numPattern/settings.batchSize ();
947  size_t numBatches_stored = numBatches;
948 
949  std::shuffle(itPatternBegin, itPatternEnd, std::default_random_engine{});
950  Iterator itPatternBatchBegin = itPatternBegin;
951  Iterator itPatternBatchEnd = itPatternBatchBegin;
952 
953  // create batches
954  std::vector<Batch> batches;
955  while (numBatches > 0)
956  {
957  std::advance (itPatternBatchEnd, settings.batchSize ());
958  batches.push_back (Batch (itPatternBatchBegin, itPatternBatchEnd));
959  itPatternBatchBegin = itPatternBatchEnd;
960  --numBatches;
961  }
962 
963  // put the remaining patterns into a final batch
964  if (itPatternBatchEnd != itPatternEnd)
965  batches.push_back (Batch (itPatternBatchEnd, itPatternEnd));
966 
967 
968  ///< turn on multithreading if requested
969  if (settings.useMultithreading ())
970  {
971  // -------------------- divide the batches into bunches for each thread --------------
972  size_t numThreads = std::thread::hardware_concurrency ();
973  size_t batchesPerThread = batches.size () / numThreads;
974  typedef std::vector<Batch>::iterator batch_iterator;
975  std::vector<std::pair<batch_iterator,batch_iterator>> batchVec;
976  batch_iterator itBatchBegin = std::begin (batches);
977  batch_iterator itBatchCurrEnd = std::begin (batches);
978  batch_iterator itBatchEnd = std::end (batches);
979  for (size_t iT = 0; iT < numThreads; ++iT)
980  {
981  if (iT == numThreads-1)
982  itBatchCurrEnd = itBatchEnd;
983  else
984  std::advance (itBatchCurrEnd, batchesPerThread);
985  batchVec.push_back (std::make_pair (itBatchBegin, itBatchCurrEnd));
986  itBatchBegin = itBatchCurrEnd;
987  }
988 
989  // -------------------- loop over batches -------------------------------------------
990  std::vector<std::future<double>> futures;
991  for (auto& batchRange : batchVec)
992  {
993  // -------------------- execute each of the batch ranges on a different thread -------------------------------
994  futures.push_back (
995  std::async (std::launch::async, [&]()
996  {
997  double localError = 0.0;
998  for (auto it = batchRange.first, itEnd = batchRange.second; it != itEnd; ++it)
999  {
1000  Batch& batch = *it;
1001  pass_through_type settingsAndBatch (settings, batch, dropContainer);
1002  Minimizer minimizerClone (minimizer);
1003  localError += minimizerClone ((*this), weights, settingsAndBatch); /// call the minimizer
1004  }
1005  return localError;
1006  })
1007  );
1008  }
1009 
1010  for (auto& f : futures)
1011  error += f.get ();
1012  }
1013  else
1014  {
1015  for (auto& batch : batches)
1016  {
1017  std::tuple<Settings&, Batch&, DropContainer&> settingsAndBatch (settings, batch, dropContainer);
1018  error += minimizer ((*this), weights, settingsAndBatch);
1019  }
1020  }
1021 
1022  numBatches_stored = std::max (numBatches_stored, size_t(1)); /// normalize the error
1023  error /= numBatches_stored;
1024  settings.testIteration ();
1025 
1026  return error;
1027  }
1028 
1029 
1030 
1031 
1032 
1033 /*! \brief compute the neural net
1034  *
1035  * \param input the input data
1036  * \param weights the weight data
1037  */
1038  template <typename Weights>
1039  std::vector<double> Net::compute (const std::vector<double>& input, const Weights& weights) const
1040  {
1041  std::vector<LayerData> layerData;
1042  layerData.reserve (m_layers.size ()+1);
1043  auto itWeight = begin (weights);
1044  auto itInputBegin = begin (input);
1045  auto itInputEnd = end (input);
1046  layerData.push_back (LayerData (itInputBegin, itInputEnd));
1047  size_t numNodesPrev = input.size ();
1048 
1049  // -------------------- prepare layer data with one pattern -------------------------------
1050  for (auto& layer: m_layers)
1051  {
1052  layerData.push_back (LayerData (layer.numNodes (), itWeight,
1053  layer.activationFunction (),
1054  layer.modeOutputValues ()));
1055  size_t _numWeights = layer.numWeights (numNodesPrev);
1056  itWeight += _numWeights;
1057  numNodesPrev = layer.numNodes ();
1058  }
1059 
1060 
1061  // --------- forward -------------
1062  forwardPattern (m_layers, layerData);
1063 
1064  // ------------- fetch output ------------------
1065  std::vector<double> output;
1066  fetchOutput (layerData.back (), output);
1067  return output;
1068  }
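// Illustrative sketch (names assumed): evaluate the net on a single input vector.
//
//   std::vector<double> input = {0.3, -1.2};
//   std::vector<double> output = net.compute (input, weights);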
1069 
1070 
1071  template <typename Weights, typename PassThrough>
1072  double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights) const
1073  {
1074  std::vector<double> nothing; // empty gradients; no backpropagation is done, just forward
1075  assert (numWeights () == weights.size ());
1076  double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (nothing), std::end (nothing), 10000, nothing, false);
1077  return error;
1078  }
1079 
1080  template <typename Weights, typename PassThrough, typename OutContainer>
1081  double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights, ModeOutput /*eFetch*/, OutContainer& outputContainer) const
1082  {
1083  std::vector<double> nothing; // empty gradients; no backpropagation is done, just forward
1084  assert (numWeights () == weights.size ());
1085  double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (nothing), std::end (nothing), 10000, outputContainer, true);
1086  return error;
1087  }
1088 
1089 
1090  template <typename Weights, typename Gradients, typename PassThrough>
1091  double Net::operator() (PassThrough& settingsAndBatch, Weights& weights, Gradients& gradients) const
1092  {
1093  std::vector<double> nothing;
1094  assert (numWeights () == weights.size ());
1095  assert (weights.size () == gradients.size ());
1096  double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (gradients), std::end (gradients), 0, nothing, false);
1097  return error;
1098  }
1099 
1100  template <typename Weights, typename Gradients, typename PassThrough, typename OutContainer>
1101  double Net::operator() (PassThrough& settingsAndBatch, Weights& weights, Gradients& gradients, ModeOutput eFetch, OutContainer& outputContainer) const
1102  {
1103  MATH_UNUSED(eFetch);
1104  assert (numWeights () == weights.size ());
1105  assert (weights.size () == gradients.size ());
1106  double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (gradients), std::end (gradients), 0, outputContainer, true);
1107  return error;
1108  }
1109 
1110 
1111 
1112  template <typename LayerContainer, typename DropContainer, typename ItWeight, typename ItGradient>
1113  std::vector<std::vector<LayerData>> Net::prepareLayerData (LayerContainer& _layers,
1114  Batch& batch,
1115  const DropContainer& dropContainer,
1116  ItWeight itWeightBegin,
1117  ItWeight /*itWeightEnd*/,
1118  ItGradient itGradientBegin,
1119  ItGradient itGradientEnd,
1120  size_t& totalNumWeights) const
1121  {
1122  LayerData::const_dropout_iterator itDropOut;
1123  bool usesDropOut = !dropContainer.empty ();
1124  if (usesDropOut)
1125  itDropOut = std::begin (dropContainer);
1126 
1127  if (_layers.empty ())
1128  throw std::string ("no layers in this net");
1129 
1130 
1131  // ----------- create layer data -------------------------------------------------------
1132  //LM- this assert is not needed anymore (outputSize is actually numNodes+1)
1133  //assert (_layers.back ().numNodes () == outputSize ());
1134  totalNumWeights = 0;
1135  size_t totalNumNodes = 0;
1136  std::vector<std::vector<LayerData>> layerPatternData;
1137  layerPatternData.reserve (_layers.size ()+1);
1138  ItWeight itWeight = itWeightBegin;
1139  ItGradient itGradient = itGradientBegin;
1140  size_t numNodesPrev = inputSize ();
1141  typename Pattern::const_iterator itInputBegin;
1142  typename Pattern::const_iterator itInputEnd;
1143 
1144  // ItWeight itGammaBegin = itWeightBegin + numWeights ();
1145  // ItWeight itBetaBegin = itWeightBegin + numWeights () + numNodes ();
1146  // ItGradient itGradGammaBegin = itGradientBegin + numWeights ();
1147  // ItGradient itGradBetaBegin = itGradientBegin + numWeights () + numNodes ();
1148 
1149 
1150  // --------------------- prepare layer data for input layer ----------------------------
1151  layerPatternData.push_back (std::vector<LayerData>());
1152  for (const Pattern& _pattern : batch)
1153  {
1154  std::vector<LayerData>& layerData = layerPatternData.back ();
1155  layerData.push_back (LayerData (numNodesPrev));
1156 
1157  itInputBegin = _pattern.beginInput ();
1158  itInputEnd = _pattern.endInput ();
1159  layerData.back ().setInput (itInputBegin, itInputEnd);
1160 
1161  if (usesDropOut)
1162  layerData.back ().setDropOut (itDropOut);
1163 
1164  }
1165 
1166 
1167  if (usesDropOut)
1168  itDropOut += _layers.back ().numNodes ();
1169 
1170  // ---------------- prepare subsequent layers ---------------------------------------------
1171  // for each of the layers
1172  for (auto itLayer = begin (_layers), itLayerEnd = end (_layers); itLayer != itLayerEnd; ++itLayer)
1173  {
1174  bool isOutputLayer = (itLayer+1 == itLayerEnd);
1175  bool isFirstHiddenLayer = (itLayer == begin (_layers));
1176 
1177  auto& layer = *itLayer;
1178  layerPatternData.push_back (std::vector<LayerData>());
1179  // for each pattern, prepare a layerData
1180  for (const Pattern& _pattern : batch)
1181  {
1182  std::vector<LayerData>& layerData = layerPatternData.back ();
1183  //layerData.push_back (LayerData (numNodesPrev));
1184 
1185  if (itGradientBegin == itGradientEnd)
1186  {
1187  layerData.push_back (LayerData (layer.numNodes (), itWeight,
1188  layer.activationFunction (),
1189  layer.modeOutputValues ()));
1190  }
1191  else
1192  {
1193  layerData.push_back (LayerData (layer.numNodes (), itWeight, itGradient,
1194  layer.activationFunction (),
1195  layer.inverseActivationFunction (),
1196  layer.modeOutputValues ()));
1197  }
1198 
1199  if (usesDropOut)
1200  {
1201  layerData.back ().setDropOut (itDropOut);
1202  }
1203 
1204  }
1205 
1206  if (usesDropOut)
1207  {
1208  itDropOut += layer.numNodes ();
1209  }
1210  size_t _numWeights = layer.numWeights (numNodesPrev);
1211  totalNumWeights += _numWeights;
1212  itWeight += _numWeights;
1213  itGradient += _numWeights;
1214  numNodesPrev = layer.numNodes ();
1215  totalNumNodes += numNodesPrev;
1216 
1217  }
1218  assert (totalNumWeights > 0);
1219  return layerPatternData;
1220 }
1221 
1222 
1223 
1224  template <typename LayerContainer>
1225  void Net::forwardPattern (const LayerContainer& _layers,
1226  std::vector<LayerData>& layerData) const
1227  {
1228  size_t idxLayer = 0, idxLayerEnd = _layers.size ();
1229  size_t cumulativeNodeCount = 0;
1230  for (; idxLayer < idxLayerEnd; ++idxLayer)
1231  {
1232  LayerData& prevLayerData = layerData.at (idxLayer);
1233  LayerData& currLayerData = layerData.at (idxLayer+1);
1234 
1235  forward (prevLayerData, currLayerData);
1236 
1237  applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ());
1238  }
1239  }
1240 
1241 
1242 
1243 
1244  template <typename LayerContainer, typename LayerPatternContainer>
1245  void Net::forwardBatch (const LayerContainer& _layers,
1246  LayerPatternContainer& layerPatternData,
1247  std::vector<double>& valuesMean,
1248  std::vector<double>& valuesStdDev,
1249  size_t trainFromLayer) const
1250  {
1251  valuesMean.clear ();
1252  valuesStdDev.clear ();
1253 
1254  // ---------------------------------- loop over layers and pattern -------------------------------------------------------
1255  size_t cumulativeNodeCount = 0;
1256  for (size_t idxLayer = 0, idxLayerEnd = layerPatternData.size (); idxLayer < idxLayerEnd-1; ++idxLayer)
1257  {
1258  bool doTraining = idxLayer >= trainFromLayer;
1259 
1260  // get layer-pattern data for this and the corresponding one from the next layer
1261  std::vector<LayerData>& prevLayerPatternData = layerPatternData.at (idxLayer);
1262  std::vector<LayerData>& currLayerPatternData = layerPatternData.at (idxLayer+1);
1263 
1264  size_t numPattern = prevLayerPatternData.size ();
1265  size_t numNodesLayer = _layers.at (idxLayer).numNodes ();
1266 
1267  std::vector<MeanVariance> means (numNodesLayer);
1268  // ---------------- loop over layerDatas of pattern compute forward ----------------------------
1269  for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern)
1270  {
1271  const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern);
1272  LayerData& currLayerData = currLayerPatternData.at (idxPattern);
1273 
1274 
1275  forward (prevLayerData, currLayerData); // feed forward
1276  }
1277 
1278  // ---------------- loop over layerDatas of pattern apply non-linearities ----------------------------
1279  for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern)
1280  {
1281  //const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern);
1282  LayerData& currLayerData = currLayerPatternData.at (idxPattern);
1283 
1284  if (doTraining)
1285  applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction (),
1286  currLayerData.inverseActivationFunction (), currLayerData.valueGradientsBegin ());
1287  else
1288  applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ());
1289  }
1290 
1291  // accumulate node count
1292  cumulativeNodeCount += numNodesLayer;
1293  }
1294 }
1295 
1296 
1297 
1298 
1299  template <typename OutputContainer>
1300  void Net::fetchOutput (const LayerData& lastLayerData, OutputContainer& outputContainer) const
1301  {
1302  ModeOutputValues eModeOutput = lastLayerData.outputMode ();
1303  if (isFlagSet (ModeOutputValues::DIRECT, eModeOutput))
1304  {
1305  outputContainer.insert (outputContainer.end (), lastLayerData.valuesBegin (), lastLayerData.valuesEnd ());
1306  }
1307  else if (isFlagSet (ModeOutputValues::SIGMOID, eModeOutput) ||
1308  isFlagSet (ModeOutputValues::SOFTMAX, eModeOutput))
1309  {
1310  const auto& prob = lastLayerData.probabilities ();
1311  outputContainer.insert (outputContainer.end (), prob.begin (), prob.end ()) ;
1312  }
1313  else
1314  assert (false);
1315  }
1316 
1317 
1318 
1319 
1320  template <typename OutputContainer>
1321  void Net::fetchOutput (const std::vector<LayerData>& lastLayerPatternData, OutputContainer& outputContainer) const
1322  {
1323  for (const LayerData& lastLayerData : lastLayerPatternData)
1324  fetchOutput (lastLayerData, outputContainer);
1325  }
1326 
1327 
1328 
1329  template <typename ItWeight>
1330  std::tuple</*sumError*/double,/*sumWeights*/double> Net::computeError (const Settings& settings,
1331  std::vector<LayerData>& lastLayerData,
1332  Batch& batch,
1333  ItWeight itWeightBegin,
1334  ItWeight itWeightEnd) const
1335  {
1336  typename std::vector<LayerData>::iterator itLayerData = lastLayerData.begin ();
1337 // typename std::vector<LayerData>::iterator itLayerDataEnd = lastLayerData.end ();
1338 
1339  typename std::vector<Pattern>::const_iterator itPattern = batch.begin ();
1340  typename std::vector<Pattern>::const_iterator itPatternEnd = batch.end ();
1341 
1342  double sumWeights (0.0);
1343  double sumError (0.0);
1344 
1345  size_t idxPattern = 0;
1346 // FIXME: check that iteration doesn't go beyond itLayerDataEnd!
1347  for ( ; itPattern != itPatternEnd; ++itPattern, ++itLayerData)
1348  {
1349  ++idxPattern;
1350 
1351  // compute E and the deltas of the computed output and the true output
1352  LayerData& layerData = (*itLayerData);
1353  const Pattern& _pattern = (*itPattern);
1354  double error = errorFunction (layerData, _pattern.output (),
1355  itWeightBegin, itWeightEnd,
1356  _pattern.weight (), settings.factorWeightDecay (),
1357  settings.regularization ());
1358  sumWeights += fabs (_pattern.weight ());
1359  sumError += error;
1360  }
1361  return std::make_tuple (sumError, sumWeights);
1362  }
1363 
1364 
1365 
1366  template <typename Settings>
1367  void Net::backPropagate (std::vector<std::vector<LayerData>>& layerPatternData,
1368  const Settings& settings,
1369  size_t trainFromLayer,
1370  size_t totalNumWeights) const
1371  {
1372  bool doTraining = layerPatternData.size () > trainFromLayer;
1373  if (doTraining) // training
1374  {
1375  // ------------- backpropagation -------------
1376  size_t idxLayer = layerPatternData.size ();
1377  for (auto itLayerPatternData = layerPatternData.rbegin (), itLayerPatternDataBegin = layerPatternData.rend ();
1378  itLayerPatternData != itLayerPatternDataBegin; ++itLayerPatternData)
1379  {
1380  --idxLayer;
1381  if (idxLayer <= trainFromLayer) // no training
1382  break;
1383 
1384  std::vector<LayerData>& currLayerDataColl = *(itLayerPatternData);
1385  std::vector<LayerData>& prevLayerDataColl = *(itLayerPatternData+1);
1386 
1387  size_t idxPattern = 0;
1388 // FIXME: check that itPrevLayerData doesn't go beyond itPrevLayerDataEnd!
1389  for (typename std::vector<LayerData>::iterator itCurrLayerData = begin (currLayerDataColl), itCurrLayerDataEnd = end (currLayerDataColl),
1390  itPrevLayerData = begin (prevLayerDataColl) /*, itPrevLayerDataEnd = end (prevLayerDataColl)*/;
1391  itCurrLayerData != itCurrLayerDataEnd; ++itCurrLayerData, ++itPrevLayerData, ++idxPattern)
1392  {
1393  LayerData& currLayerData = (*itCurrLayerData);
1394  LayerData& prevLayerData = *(itPrevLayerData);
1395 
1396  backward (prevLayerData, currLayerData);
1397 
1398  // the factorWeightDecay has to be scaled by 1/n, where n is the number of weights (synapses),
1399  // because the L1 and L2 regularization terms are averaged over the number of weights
1400  //
1401  // http://neuralnetworksanddeeplearning.com/chap3.html#overfitting_and_regularization
1402  //
1403  // L1 : -factorWeightDecay*sgn(w)/numWeights
1404  // L2 : -factorWeightDecay*w/numWeights
1405  update (prevLayerData, currLayerData, settings.factorWeightDecay ()/totalNumWeights, settings.regularization ());
1406  }
1407  }
1408  }
1409  }
1410 
1411 
1412 
1413 /*! \brief forward propagation and backward propagation
1414  *
1415  *
1416  */
1417  template <typename LayerContainer, typename PassThrough, typename ItWeight, typename ItGradient, typename OutContainer>
1418  double Net::forward_backward (LayerContainer& _layers, PassThrough& settingsAndBatch,
1419  ItWeight itWeightBegin, ItWeight itWeightEnd,
1420  ItGradient itGradientBegin, ItGradient itGradientEnd,
1421  size_t trainFromLayer,
1422  OutContainer& outputContainer, bool doFetchOutput) const
1423  {
1424  Settings& settings = std::get<0>(settingsAndBatch);
1425  Batch& batch = std::get<1>(settingsAndBatch);
1426  DropContainer& dropContainer = std::get<2>(settingsAndBatch);
1427 
1428  double sumError = 0.0;
1429  double sumWeights = 0.0; // -------------
1430 
1431 
1432  // ----------------------------- prepare layer data -------------------------------------
1433  size_t totalNumWeights (0);
1434  std::vector<std::vector<LayerData>> layerPatternData = prepareLayerData (_layers,
1435  batch,
1436  dropContainer,
1437  itWeightBegin,
1438  itWeightEnd,
1439  itGradientBegin,
1440  itGradientEnd,
1441  totalNumWeights);
1442 
1443 
1444 
1445  // ---------------------------------- propagate forward ------------------------------------------------------------------
1446  std::vector<double> valuesMean;
1447  std::vector<double> valuesStdDev;
1448  forwardBatch (_layers, layerPatternData, valuesMean, valuesStdDev, trainFromLayer);
1449 
1450 
1451  // ------------- fetch output ------------------
1452  if (doFetchOutput)
1453  {
1454  fetchOutput (layerPatternData.back (), outputContainer);
1455  }
1456 
1457 
1458  // ------------- error computation -------------
1459  std::tie (sumError, sumWeights) = computeError (settings, layerPatternData.back (), batch, itWeightBegin, itWeightBegin + totalNumWeights);
1460 
1461 
1462  // ------------- backpropagation -------------
1463  backPropagate (layerPatternData, settings, trainFromLayer, totalNumWeights);
1464 
1465 
1466  // --- compile the measures
1467  double batchSize = std::distance (std::begin (batch), std::end (batch));
1468  for (auto it = itGradientBegin; it != itGradientEnd; ++it)
1469  (*it) /= batchSize;
1470 
1471 
1472  sumError /= sumWeights;
1473  return sumError;
1474  }
1475 
1476 
1477 
1478 /*! \brief initialization of the weights
1479  *
1480  *
1481  */
1482  template <typename OutIterator>
1483  void Net::initializeWeights (WeightInitializationStrategy eInitStrategy, OutIterator itWeight)
1484  {
1485  if (eInitStrategy == WeightInitializationStrategy::XAVIER)
1486  {
1487  // input and output properties
1488  int numInput = inputSize ();
1489 
1490  // compute variance and mean of input and output
1491  //...
1492 
1493 
1494  // compute the weights
1495  for (auto& layer: layers ())
1496  {
1497  double nIn = numInput;
1498  double stdDev = sqrt (2.0/nIn);
1499  for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1500  {
1501  (*itWeight) = DNN::gaussDouble (0.0, stdDev); // factor 2.0 for ReLU
1502  ++itWeight;
1503  }
1504  numInput = layer.numNodes ();
1505  }
1506  return;
1507  }
1508 
1509  if (eInitStrategy == WeightInitializationStrategy::XAVIERUNIFORM)
1510  {
1511  // input and output properties
1512  int numInput = inputSize ();
1513 
1514  // compute variance and mean of input and output
1515  //...
1516 
1517 
1518  // compute the weights
1519  for (auto& layer: layers ())
1520  {
1521  double nIn = numInput;
1522  double minVal = -sqrt(2.0/nIn);
1523  double maxVal = sqrt (2.0/nIn);
1524  for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1525  {
1526 
1527  (*itWeight) = DNN::uniformDouble (minVal, maxVal); // factor 2.0 for ReLU
1528  ++itWeight;
1529  }
1530  numInput = layer.numNodes ();
1531  }
1532  return;
1533  }
1534 
1535  if (eInitStrategy == WeightInitializationStrategy::TEST)
1536  {
1537  // input and output properties
1538  int numInput = inputSize ();
1539 
1540  // compute variance and mean of input and output
1541  //...
1542 
1543 
1544  // compute the weights
1545  for (auto& layer: layers ())
1546  {
1547 // double nIn = numInput;
1548  for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1549  {
1550  (*itWeight) = DNN::gaussDouble (0.0, 0.1);
1551  ++itWeight;
1552  }
1553  numInput = layer.numNodes ();
1554  }
1555  return;
1556  }
1557 
1558  if (eInitStrategy == WeightInitializationStrategy::LAYERSIZE)
1559  {
1560  // input and output properties
1561  int numInput = inputSize ();
1562 
1563  // compute variance and mean of input and output
1564  //...
1565 
1566 
1567  // compute the weights
1568  for (auto& layer: layers ())
1569  {
1570  double nIn = numInput;
1571  for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1572  {
1573  (*itWeight) = DNN::gaussDouble (0.0, sqrt (layer.numWeights (nIn))); // factor 2.0 for ReLU
1574  ++itWeight;
1575  }
1576  numInput = layer.numNodes ();
1577  }
1578  return;
1579  }
1580 
1581  }
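// Illustrative sketch: XAVIERUNIFORM initialization into an empty weight vector, as used in the
// commented-out preTrain code below.
//
//   std::vector<double> weights;
//   net.initializeWeights (WeightInitializationStrategy::XAVIERUNIFORM, std::back_inserter (weights));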
1582 
1583 
1584 
1585 
1586 
1587 /*! \brief compute the error function
1588  *
1589  *
1590  */
1591  template <typename Container, typename ItWeight>
1592  double Net::errorFunction (LayerData& layerData,
1593  Container truth,
1594  ItWeight itWeight,
1595  ItWeight itWeightEnd,
1596  double patternWeight,
1597  double factorWeightDecay,
1598  EnumRegularization eRegularization) const
1599  {
1600  double error (0);
1601  switch (m_eErrorFunction)
1602  {
1603  case ModeErrorFunction::SUMOFSQUARES:
1604  {
1605  error = sumOfSquares (layerData.valuesBegin (), layerData.valuesEnd (), begin (truth), end (truth),
1606  layerData.deltasBegin (), layerData.deltasEnd (),
1607  layerData.inverseActivationFunction (),
1608  patternWeight);
1609  break;
1610  }
1611  case ModeErrorFunction::CROSSENTROPY:
1612  {
1613  assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ()));
1614  std::vector<double> probabilities = layerData.probabilities ();
1615  error = crossEntropy (begin (probabilities), end (probabilities),
1616  begin (truth), end (truth),
1617  layerData.deltasBegin (), layerData.deltasEnd (),
1618  layerData.inverseActivationFunction (),
1619  patternWeight);
1620  break;
1621  }
1622  case ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE:
1623  {
1624  std::cout << "softmax." << std::endl;
1625  assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ()));
1626  std::vector<double> probabilities = layerData.probabilities ();
1627  error = softMaxCrossEntropy (begin (probabilities), end (probabilities),
1628  begin (truth), end (truth),
1629  layerData.deltasBegin (), layerData.deltasEnd (),
1630  layerData.inverseActivationFunction (),
1631  patternWeight);
1632  break;
1633  }
1634  }
1635  if (factorWeightDecay != 0 && eRegularization != EnumRegularization::NONE)
1636  {
1637  error = weightDecay (error, itWeight, itWeightEnd, factorWeightDecay, eRegularization);
1638  }
1639  return error;
1640  }
1641 
1642 
1643 
1644 
1645 
1646 
1647 
1648 // /*! \brief pre-training
1649 // *
1650 // * in development
1651 // */
1652 // template <typename Minimizer>
1653 // void Net::preTrain (std::vector<double>& weights,
1654 // std::vector<Pattern>& trainPattern,
1655 // const std::vector<Pattern>& testPattern,
1656 // Minimizer& minimizer, Settings& settings)
1657 // {
1658 // auto itWeightGeneral = std::begin (weights);
1659 // std::vector<Pattern> prePatternTrain (trainPattern.size ());
1660 // std::vector<Pattern> prePatternTest (testPattern.size ());
1661 
1662 // size_t _inputSize = inputSize ();
1663 
1664 // // transform pattern using the created preNet
1665 // auto initializePrePattern = [&](const std::vector<Pattern>& pttrnInput, std::vector<Pattern>& pttrnOutput)
1666 // {
1667 // pttrnOutput.clear ();
1668 // std::transform (std::begin (pttrnInput), std::end (pttrnInput),
1669 // std::back_inserter (pttrnOutput),
1670 // [](const Pattern& p)
1671 // {
1672 // Pattern pat (p.input (), p.input (), p.weight ());
1673 // return pat;
1674 // });
1675 // };
1676 
1677 // initializePrePattern (trainPattern, prePatternTrain);
1678 // initializePrePattern (testPattern, prePatternTest);
1679 
1680 // std::vector<double> originalDropFractions = settings.dropFractions ();
1681 
1682 // for (auto& _layer : layers ())
1683 // {
1684 // // compute number of weights (as a function of the number of incoming nodes)
1685 // // fetch number of nodes
1686 // size_t numNodes = _layer.numNodes ();
1687 // size_t _numWeights = _layer.numWeights (_inputSize);
1688 
1689 // // ------------------
1690 // DNN::Net preNet;
1691 // if (!originalDropFractions.empty ())
1692 // {
1693 // originalDropFractions.erase (originalDropFractions.begin ());
1694 // settings.setDropOut (originalDropFractions.begin (), originalDropFractions.end (), settings.dropRepetitions ());
1695 // }
1696 // std::vector<double> preWeights;
1697 
1698 // // define the preNet (pretraining-net) for this layer
1699 // // outputSize == inputSize, because this is an autoencoder;
1700 // preNet.setInputSize (_inputSize);
1701 // preNet.addLayer (DNN::Layer (numNodes, _layer.activationFunctionType ()));
1702 // preNet.addLayer (DNN::Layer (_inputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::DIRECT));
1703 // preNet.setErrorFunction (DNN::ModeErrorFunction::SUMOFSQUARES);
1704 // preNet.setOutputSize (_inputSize); // outputSize is the inputSize (autoencoder)
1705 
1706 // // initialize weights
1707 // preNet.initializeWeights (DNN::WeightInitializationStrategy::XAVIERUNIFORM,
1708 // std::back_inserter (preWeights));
1709 
1710 // // overwrite already existing weights from the "general" weights
1711 // std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ());
1712 // std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()+_numWeights); // set identical weights for the temporary output layer
1713 
1714 
1715 // // train the "preNet"
1716 // preNet.train (preWeights, prePatternTrain, prePatternTest, minimizer, settings);
1717 
1718 // // fetch the pre-trained weights (without the output part of the autoencoder)
1719 // std::copy (std::begin (preWeights), std::begin (preWeights) + _numWeights, itWeightGeneral);
1720 
1721 // // advance the iterator on the incoming weights
1722 // itWeightGeneral += _numWeights;
1723 
1724 // // remove the weights of the output layer of the preNet
1725 // preWeights.erase (preWeights.begin () + _numWeights, preWeights.end ());
1726 
1727 // // remove the outputLayer of the preNet
1728 // preNet.removeLayer ();
1729 
1730 // // set the output size to the number of nodes in the new output layer (== last hidden layer)
1731 // preNet.setOutputSize (numNodes);
1732 
1733 // // transform pattern using the created preNet
1734 // auto proceedPattern = [&](std::vector<Pattern>& pttrn)
1735 // {
1736 // std::vector<Pattern> newPttrn;
1737 // std::for_each (std::begin (pttrn), std::end (pttrn),
1738 // [&preNet,&preWeights,&newPttrn](Pattern& p)
1739 // {
1740 // std::vector<double> output = preNet.compute (p.input (), preWeights);
1741 // Pattern pat (output, output, p.weight ());
1742 // newPttrn.push_back (pat);
1743 // // p = pat;
1744 // });
1745 // return newPttrn;
1746 // };
1747 
1748 
1749 // prePatternTrain = proceedPattern (prePatternTrain);
1750 // prePatternTest = proceedPattern (prePatternTest);
1751 
1752 
1753 // // the new input size is the output size of the already reduced preNet
1754 // _inputSize = preNet.layers ().back ().numNodes ();
1755 // }
1756 // }
1757 
1758 
1759 
1760 
1761 
1762 
1763 
1764 
1765 
1766 
1767 
1768 
1769 
1770 
1771 
1772 
1773  } // namespace DNN
1774 } // namespace TMVA
1775 
1776 #endif