Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RColumnValue.hxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 09/2018
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RCOLUMNVALUE
12 #define ROOT_RCOLUMNVALUE
13 
15 #include <ROOT/RDF/Utils.hxx> // IsRVec_t, TypeID2TypeName
17 #include <ROOT/RMakeUnique.hxx>
18 #include <ROOT/RVec.hxx>
19 #include <ROOT/TypeTraits.hxx> // TakeFirstParameter_t
20 #include <RtypesCore.h>
21 #include <TTreeReader.h>
22 #include <TTreeReaderValue.h>
23 #include <TTreeReaderArray.h>
24 
25 #include <cstring> // strcmp
26 #include <initializer_list>
27 #include <limits>
28 #include <memory>
29 #include <stdexcept>
30 #include <string>
31 #include <tuple>
32 #include <type_traits>
33 #include <vector>
34 
35 namespace ROOT {
36 namespace Internal {
37 namespace RDF {
38 using namespace ROOT::VecOps;
39 
40 /**
41 \class ROOT::Internal::RDF::RColumnValue
42 \ingroup dataframe
43 \brief Helper class that updates and returns TTree branches as well as RDataFrame temporary columns
44 \tparam T The type of the column
45 
46 RDataFrame nodes must access two different types of values during the event loop:
47 values of real branches, for which TTreeReader{Values,Arrays} act as proxies, or
48 temporary columns whose values are generated on the fly. While the type of the
49 value is known at compile time (or just-in-time), it is only at runtime that nodes
50 can check whether a certain value is generated on the fly or not.
51 
52 RColumnValue abstracts this difference by providing the same interface for
53 both cases and handling the reading or generation of new values transparently.
54 Only one of the two data members fReaderProxy or fValuePtr will be non-null
55 for a given RColumnValue, depending on whether the value comes from a real
56 TTree branch or from a temporary column respectively.
57 
58 RDataFrame nodes can store tuples of RColumnValues and retrieve an updated
59 value for the column via the `Get` method.
60 **/
61 template <typename T>
62 class R__CLING_PTRCHECK(off) RColumnValue {
63 // R__CLING_PTRCHECK is disabled because all pointers are hand-crafted by RDF.
64 
65  using MustUseRVec_t = IsRVec_t<T>;
66 
67  // ColumnValue_t is the type of the column or the type of the elements of an array column
68  using ColumnValue_t = typename std::conditional<MustUseRVec_t::value, TakeFirstParameter_t<T>, T>::type;
69  using TreeReader_t = typename std::conditional<MustUseRVec_t::value, TTreeReaderArray<ColumnValue_t>,
70  TTreeReaderValue<ColumnValue_t>>::type;
71 
72  /// RColumnValue has a slightly different behaviour whether the column comes from a TTreeReader, a RDataFrame Define
73  /// or a RDataSource. It stores which it is as an enum.
74  enum class EColumnKind { kTree, kCustomColumn, kDataSource, kInvalid };
75  // Set to the correct value by MakeProxy or SetTmpColumn
76  EColumnKind fColumnKind = EColumnKind::kInvalid;
77  /// The slot this value belongs to. Only needed when querying custom column values, it is set in `SetTmpColumn`.
78  unsigned int fSlot = std::numeric_limits<unsigned int>::max();
79 
80  // Each element of the following stacks will be in use by a _single task_.
81  // Each task will push one element when it starts and pop it when it ends.
82  // Stacks will typically be very small (1-2 elements typically) and will only grow over size 1 in case of interleaved
83  // task execution i.e. when more than one task needs readers in this worker thread.
84 
85  /// Owning ptrs to a TTreeReaderValue or TTreeReaderArray. Only used for Tree columns.
86  std::unique_ptr<TreeReader_t> fTreeReader;
87  /// Non-owning ptrs to the value of a custom column.
88  T *fCustomValuePtr;
89  /// Non-owning ptrs to the value of a data-source column.
90  T **fDSValuePtr;
91  /// Non-owning ptrs to the node responsible for the custom column. Needed when querying custom values.
92  RCustomColumnBase *fCustomColumn;
93  /// Enumerator for the different properties of the branch storage in memory
94  enum class EStorageType : char { kContiguous, kUnknown, kSparse };
95  /// Signal whether we ever checked that the branch we are reading with a TTreeReaderArray stores array elements
96  /// in contiguous memory. Only used when T == RVec<U>.
97  EStorageType fStorageType = EStorageType::kUnknown;
98  /// If MustUseRVec, i.e. we are reading an array, we return a reference to this RVec to clients
99  RVec<ColumnValue_t> fRVec;
100  bool fCopyWarningPrinted = false;
101 
102 public:
103  RColumnValue(){};
104 
105  void SetTmpColumn(unsigned int slot, RCustomColumnBase *customColumn)
106  {
107  fCustomColumn = customColumn;
108  // Here we compare names and not typeinfos since they may come from two different contexts: a compiled
109  // and a jitted one.
110  const auto diffTypes = (0 != strcmp(customColumn->GetTypeId().name(), typeid(T).name()));
111  auto inheritedType = [&](){
112  auto colTClass = TClass::GetClass(customColumn->GetTypeId());
113  return colTClass && colTClass->InheritsFrom(TClass::GetClass<T>());
114  };
115 
116  if (diffTypes && !inheritedType()) {
117  const auto tName = TypeID2TypeName(typeid(T));
118  const auto colTypeName = TypeID2TypeName(customColumn->GetTypeId());
119  std::string errMsg = "RColumnValue: type specified for column \"" +
120  customColumn->GetName() + "\" is ";
121  if (tName.empty()) {
122  errMsg += typeid(T).name();
123  errMsg += " (extracted from type info)";
124  } else {
125  errMsg += tName;
126  }
127  errMsg += " but temporary column has type ";
128  if (colTypeName.empty()) {
129  auto &id = customColumn->GetTypeId();
130  errMsg += id.name();
131  errMsg += " (extracted from type info)";
132  } else {
133  errMsg += colTypeName;
134  }
135  throw std::runtime_error(errMsg);
136  }
137 
138  if (customColumn->IsDataSourceColumn()) {
139  fColumnKind = EColumnKind::kDataSource;
140  fDSValuePtr = static_cast<T **>(customColumn->GetValuePtr(slot));
141  } else {
142  fColumnKind = EColumnKind::kCustomColumn;
143  fCustomValuePtr = static_cast<T *>(customColumn->GetValuePtr(slot));
144  }
145  fSlot = slot;
146  }
147 
148  void MakeProxy(TTreeReader *r, const std::string &bn)
149  {
150  fColumnKind = EColumnKind::kTree;
151  fTreeReader = std::make_unique<TreeReader_t>(*r, bn.c_str());
152  }
153 
154  /// This overload is used to return scalar quantities (i.e. types that are not read into a RVec)
155  // This method is executed inside the event-loop, many times per entry
156  // If need be, the if statement can be avoided using thunks
157  // (have both branches inside functions and have a pointer to the branch to be executed)
158  template <typename U = T, typename std::enable_if<!RColumnValue<U>::MustUseRVec_t::value, int>::type = 0>
159  T &Get(Long64_t entry)
160  {
161  if (fColumnKind == EColumnKind::kTree) {
162  return *(fTreeReader->Get());
163  } else {
164  fCustomColumn->Update(fSlot, entry);
165  return fColumnKind == EColumnKind::kCustomColumn ? *fCustomValuePtr : **fDSValuePtr;
166  }
167  }
168 
169  /// This overload is used to return arrays (i.e. types that are read into a RVec).
170  /// In this case the returned T is always a RVec<ColumnValue_t>.
171  /// RVec<bool> is treated differently, in a separate overload.
172  template <typename U = T,
173  typename std::enable_if<RColumnValue<U>::MustUseRVec_t::value && !std::is_same<U, RVec<bool>>::value,
174  int>::type = 0>
175  T &Get(Long64_t entry)
176  {
177  if (fColumnKind == EColumnKind::kTree) {
178  auto &readerArray = *fTreeReader;
179  // We only use TTreeReaderArrays to read columns that users flagged as type `RVec`, so we need to check
180  // that the branch stores the array as contiguous memory that we can actually wrap in an `RVec`.
181  // Currently we need the first entry to have been loaded to perform the check
182  // TODO Move check to `MakeProxy` once Axel implements this kind of check in TTreeReaderArray using
183  // TBranchProxy
184 
185  if (EStorageType::kUnknown == fStorageType && readerArray.GetSize() > 1) {
186  // We can decide since the array is long enough
187  fStorageType =
188  (1 == (&readerArray[1] - &readerArray[0])) ? EStorageType::kContiguous : EStorageType::kSparse;
189  }
190 
191  const auto readerArraySize = readerArray.GetSize();
192  if (EStorageType::kContiguous == fStorageType ||
193  (EStorageType::kUnknown == fStorageType && readerArray.GetSize() < 2)) {
194  if (readerArraySize > 0) {
195  // trigger loading of the contents of the TTreeReaderArray
196  // the address of the first element in the reader array is not necessarily equal to
197  // the address returned by the GetAddress method
198  auto readerArrayAddr = &readerArray.At(0);
199  T rvec(readerArrayAddr, readerArraySize);
200  std::swap(fRVec, rvec);
201  } else {
202  T emptyVec{};
203  std::swap(fRVec, emptyVec);
204  }
205  } else {
206  // The storage is not contiguous or we don't know yet: we cannot but copy into the rvec
207 #ifndef NDEBUG
208  if (!fCopyWarningPrinted) {
209  Warning("RColumnValue::Get",
210  "Branch %s hangs from a non-split branch. A copy is being performed in order "
211  "to properly read the content.",
212  readerArray.GetBranchName());
213  fCopyWarningPrinted = true;
214  }
215 #else
216  (void)fCopyWarningPrinted;
217 #endif
218  if (readerArraySize > 0) {
219  T rvec(readerArray.begin(), readerArray.end());
220  std::swap(fRVec, rvec);
221  } else {
222  T emptyVec{};
223  std::swap(fRVec, emptyVec);
224  }
225  }
226  return fRVec;
227 
228  } else {
229  fCustomColumn->Update(fSlot, entry);
230  return fColumnKind == EColumnKind::kCustomColumn ? *fCustomValuePtr : **fDSValuePtr;
231  }
232  }
233 
234  /// This overload covers the RVec<bool> case. In this case we always copy the contents of TTreeReaderArray<bool>
235  /// into RVec<bool> (never take a view into the memory buffer) because the underlying memory buffer might be the
236  /// one of a std::vector<bool>, which is not a contiguous slab of bool values.
237  /// Note that this also penalizes the case in which the column type is actually bool[], but the possible performance
238  /// gains in this edge case is probably not worth the extra complication required to differentiate the two cases.
239  template <typename U = T,
240  typename std::enable_if<RColumnValue<U>::MustUseRVec_t::value && std::is_same<U, RVec<bool>>::value,
241  int>::type = 0>
242  T &Get(Long64_t entry)
243  {
244  if (fColumnKind == EColumnKind::kTree) {
245  auto &readerArray = *fTreeReader;
246  const auto readerArraySize = readerArray.GetSize();
247  if (readerArraySize > 0) {
248  // always perform a copy
249  T rvec(readerArray.begin(), readerArray.end());
250  std::swap(fRVec, rvec);
251  } else {
252  T emptyVec{};
253  std::swap(fRVec, emptyVec);
254  }
255  return fRVec;
256  } else {
257  // business as usual
258  fCustomColumn->Update(fSlot, entry);
259  return fColumnKind == EColumnKind::kCustomColumn ? *fCustomValuePtr : **fDSValuePtr;
260  }
261  }
262 
263  void Reset()
264  {
265  // This method should by all means not be removed, together with all
266  // of its callers, otherwise a race condition takes place in which a
267  // TTreeReader and its TTreeReader{Value,Array}s could be deleted
268  // concurrently:
269  // - Thread #1) a task ends and pushes back processing slot
270  // - Thread #2) a task starts and overwrites thread-local TTreeReaderValues
271  // - Thread #1) first task deletes TTreeReader
272  // See https://github.com/root-project/root/commit/26e8ace6e47de6794ac9ec770c3bbff9b7f2e945
273  if (EColumnKind::kTree == fColumnKind) {
274  fTreeReader.reset();
275  }
276  }
277 };
278 
279 // Extern template declarations of RColumnValue for commonly used column types, meant to
280 // speed up compilation/interpretation time. They are only active for language standards
281 // older than C++17 because of a bug in our clang. See ROOT-9499.
282 #if __cplusplus < 201703L
283 extern template class RColumnValue<int>;
284 extern template class RColumnValue<unsigned int>;
285 extern template class RColumnValue<char>;
286 extern template class RColumnValue<unsigned char>;
287 extern template class RColumnValue<float>;
288 extern template class RColumnValue<double>;
289 extern template class RColumnValue<Long64_t>;
290 extern template class RColumnValue<ULong64_t>;
291 extern template class RColumnValue<std::vector<int>>;
292 extern template class RColumnValue<std::vector<unsigned int>>;
293 extern template class RColumnValue<std::vector<char>>;
294 extern template class RColumnValue<std::vector<unsigned char>>;
295 extern template class RColumnValue<std::vector<float>>;
296 extern template class RColumnValue<std::vector<double>>;
297 extern template class RColumnValue<std::vector<Long64_t>>;
298 extern template class RColumnValue<std::vector<ULong64_t>>;
299 #endif
300 
301 template <typename T>
302 struct TRDFValueTuple {
303 };
304 
305 template <typename... BranchTypes>
306 struct TRDFValueTuple<TypeList<BranchTypes...>> {
307  using type = std::tuple<RColumnValue<BranchTypes>...>;
308 };
309 
310 template <typename BranchType>
311 using RDFValueTuple_t = typename TRDFValueTuple<BranchType>::type;
312 
/// Clear the proxies of a tuple of RColumnValues: call `Reset()` on each element of `values`
/// selected by the indices `S...`, in the order in which the indices are listed.
template <typename ValueTuple, std::size_t... S>
void ResetRDFValueTuple(ValueTuple &values, std::index_sequence<S...>)
{
   // Pre-C++17 replacement for a fold expression: the pack is expanded inside a braced list, whose
   // elements are evaluated left to right. The leading 0 keeps the array non-empty for an empty pack.
   using Expander_t = int[];
   (void)Expander_t{0, (std::get<S>(values).Reset(), 0)...};
}
321 
322 
323 } // ns RDF
324 } // ns Internal
325 } // ns ROOT
326 
327 #endif // ROOT_RCOLUMNVALUE