Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RDataSource.hxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RDATASOURCE
12 #define ROOT_RDATASOURCE
13 
14 #include "ROOT/RStringView.hxx"
15 #include "RtypesCore.h" // ULong64_t
16 #include "TString.h"
17 
18 #include <algorithm> // std::transform
19 #include <string>
20 #include <typeinfo>
21 #include <vector>
22 
23 namespace ROOT {
24 namespace RDF {
25 class RDataSource;
26 }
27 }
28 
29 /// Print a RDataSource at the prompt
30 namespace cling {
31 std::string printValue(ROOT::RDF::RDataSource *ds);
32 } // namespace cling
33 
34 namespace ROOT {
35 
namespace Internal {
namespace TDS {

/// Type-erased base class of TTypedPointerHolder.
///
/// It stores the wrapped address as a `void *`, so instances of different
/// TTypedPointerHolder specialisations can be kept (by pointer) in the same
/// container. The pointee is released by the destructor of the derived class,
/// which knows its real type.
class TPointerHolder {
protected:
   void *fPointer{nullptr};

public:
   TPointerHolder(void *ptr) : fPointer(ptr) {}

   // The derived class deletes the pointee in its destructor: a member-wise copy
   // would lead to a double delete. Use GetDeepCopy() to duplicate a holder.
   TPointerHolder(const TPointerHolder &) = delete;
   TPointerHolder &operator=(const TPointerHolder &) = delete;

   /// Return the wrapped address.
   void *GetPointer() const { return fPointer; }
   /// Return the address of the internal pointer itself (i.e. a type-erased `void **`).
   void *GetPointerAddr() { return &fPointer; }
   /// Return a newly allocated holder; see the override in TTypedPointerHolder.
   virtual TPointerHolder *GetDeepCopy() = 0;
   virtual ~TPointerHolder() = default;
};

/// Class to wrap a pointer to a `T` and delete the memory associated to it
/// correctly upon destruction.
/// \tparam T The real type of the wrapped object; must be copy-constructible for GetDeepCopy().
template <typename T>
class TTypedPointerHolder final : public TPointerHolder {
public:
   TTypedPointerHolder(T *ptr) : TPointerHolder(static_cast<void *>(ptr)) {}

   // Copying would make two holders delete the same object.
   TTypedPointerHolder(const TTypedPointerHolder &) = delete;
   TTypedPointerHolder &operator=(const TTypedPointerHolder &) = delete;

   /// Return a new, heap-allocated holder wrapping a copy-constructed clone of the pointee.
   /// A holder wrapping `nullptr` yields another holder wrapping `nullptr`.
   /// The returned holder is allocated with `new`; the caller is responsible for deleting it.
   TPointerHolder *GetDeepCopy() override
   {
      const auto typedPtr = static_cast<T *>(fPointer);
      // Guard against dereferencing a null pointee.
      return new TTypedPointerHolder(typedPtr ? new T(*typedPtr) : nullptr);
   }

   /// Delete the pointee through its real type (deleting `nullptr` is a no-op).
   ~TTypedPointerHolder() override { delete static_cast<T *>(fPointer); }
};

} // ns TDS
} // ns Internal
73 
74 namespace RDF {
75 
76 // clang-format off
77 /**
78 \class ROOT::RDF::RDataSource
79 \ingroup dataframe
80 \brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
81 
82 A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
83 methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data formats.
84 RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
85 for selected columns and to advance the readers to the desired data entry.
86 
87 The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
88 
89  - SetNSlots() : inform RDataSource of the desired level of parallelism
90  - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
91  - Initialise() : inform RDataSource that an event-loop is about to start
92  - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
93  - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
94  - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
95  - FinaliseSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
96  - Finalise() : inform RDataSource that an event-loop finished
97 
98 RDataSource implementations must support running multiple event-loops consecutively (although sequentially) on the same dataset.
99  - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
100  - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
101  - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
102  - \b Initialise() and \b Finalise() are called once per event-loop, right before starting and right after finishing.
103  - \b InitSlot(), \b SetEntry(), and \b FinaliseSlot() can be called concurrently from multiple threads, multiple times per event-loop.
104 */
105 class RDataSource {
106  // clang-format on
107 protected:
108  using Record_t = std::vector<void *>;
109  friend std::string cling::printValue(::ROOT::RDF::RDataSource *);
110 
111  virtual std::string AsString() { return "generic data source"; };
112 
113 public:
114  virtual ~RDataSource() = default;
115 
116  // clang-format off
117  /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
118  /// Slots numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
119  /// pass different slot values when calling methods concurrently.
120  // clang-format on
121  virtual void SetNSlots(unsigned int nSlots) = 0;
122 
123  // clang-format off
124  /// \brief Returns a reference to the collection of the dataset's column names
125  // clang-format on
126  virtual const std::vector<std::string> &GetColumnNames() const = 0;
127 
128  /// \brief Checks if the dataset has a certain column
129  /// \param[in] columnName The name of the column
130  virtual bool HasColumn(std::string_view) const = 0;
131 
132  // clang-format off
133  /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
134  /// \param[in] columnName The name of the column
135  // clang-format on
136  virtual std::string GetTypeName(std::string_view) const = 0;
137 
138  // clang-format off
139  /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
140  /// \tparam T The type of the data stored in the column
141  /// \param[in] columnName The name of the column
142  ///
143  /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
144  /// the "right" memory region.
145  // clang-format on
146  template <typename T>
147  std::vector<T **> GetColumnReaders(std::string_view columnName)
148  {
149  auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T));
150  std::vector<T **> typedVec(typeErasedVec.size());
151  std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
152  [](void *p) { return static_cast<T **>(p); });
153  return typedVec;
154  }
155 
156  // clang-format off
157  /// \brief Return ranges of entries to distribute to tasks.
158  /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
159  /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
160  /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
161  /// The same entry range should not be returned more than once.
162  /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop.
163  // clang-format on
164  virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
165 
166  // clang-format off
167  /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
168  /// \param[in] slot The data processing slot that needs to be considered
169  /// \param[in] entry The entry which needs to be pointed to by the reader pointers
170  /// Slots are adopted to accommodate parallel data processing.
171  /// Different workers will loop over different ranges and
172  /// will be labelled by different "slot" values.
173  /// Returns *true* if the entry has to be processed, *false* otherwise.
174  // clang-format on
175  virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
176 
177  // clang-format off
178  /// \brief Convenience method called before starting an event-loop.
179  /// This method might be called multiple times over the lifetime of a RDataSource, since
180  /// users can run multiple event-loops with the same RDataFrame.
181  /// Ideally, `Initialise` should set the state of the RDataSource so that multiple identical event-loops
182  /// will produce identical results.
183  // clang-format on
184  virtual void Initialise() {}
185 
186  // clang-format off
187  /// \brief Convenience method called at the start of the data processing associated to a slot.
188  /// \param[in] slot The data processing slot wihch needs to be initialised
189  /// \param[in] firstEntry The first entry of the range that the task will process.
190  /// This method might be called multiple times per thread per event-loop.
191  // clang-format on
192  virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
193 
194  // clang-format off
195  /// \brief Convenience method called at the end of the data processing associated to a slot.
196  /// \param[in] slot The data processing slot wihch needs to be finalised
197  /// This method might be called multiple times per thread per event-loop.
198  // clang-format on
199  virtual void FinaliseSlot(unsigned int /*slot*/) {}
200 
201  // clang-format off
202  /// \brief Convenience method called after concluding an event-loop.
203  /// See Initialise for more details.
204  // clang-format on
205  virtual void Finalise() {}
206 
207  /// \brief Return a string representation of the datasource type.
208  /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
209  /// the datasource in the visualization of the computation graph.
210  /// Concrete datasources can override the default implementation.
211  virtual std::string GetLabel() { return "Custom Datasource"; }
212 
213 protected:
214  /// type-erased vector of pointers to pointers to column values - one per slot
215  virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
216 };
217 
218 } // ns RDF
219 
220 } // ns ROOT
221 
222 /// Print a RDataSource at the prompt
223 namespace cling {
224 inline std::string printValue(ROOT::RDF::RDataSource *ds)
225 {
226  return ds->AsString();
227 }
228 } // namespace cling
229 
230 #endif // ROOT_RDATASOURCE