Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RNTupleDescriptor.hxx
Go to the documentation of this file.
1 /// \file ROOT/RNTupleDescriptor.hxx
2 /// \ingroup NTuple ROOT7
3 /// \author Jakob Blomer <jblomer@cern.ch>
4 /// \date 2018-07-19
5 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6 /// is welcome!
7 
8 /*************************************************************************
9  * Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. *
10  * All rights reserved. *
11  * *
12  * For the licensing terms see $ROOTSYS/LICENSE. *
13  * For the list of contributors see $ROOTSYS/README/CREDITS. *
14  *************************************************************************/
15 
16 #ifndef ROOT7_RNTupleDescriptor
17 #define ROOT7_RNTupleDescriptor
18 
19 #include <ROOT/RColumnModel.hxx>
20 #include <ROOT/RNTupleUtil.hxx>
21 #include <ROOT/RStringView.hxx>
22 
23 #include <chrono>
24 #include <memory>
25 #include <ostream>
26 #include <vector>
27 #include <string>
28 #include <unordered_map>
29 
30 namespace ROOT {
31 namespace Experimental {
32 
33 class RNTupleDescriptorBuilder;
34 class RNTupleModel;
35 
36 // clang-format off
37 /**
38 \class ROOT::Experimental::RFieldDescriptor
39 \ingroup NTuple
40 \brief Meta-data stored for every field of an ntuple
41 */
42 // clang-format on
43 class RFieldDescriptor {
44  friend class RNTupleDescriptorBuilder;
45 
46 private:
47  DescriptorId_t fFieldId = kInvalidDescriptorId;
48  /// The version of the C++-type-to-column translation mechanics
49  RNTupleVersion fFieldVersion;
50  /// The version of the C++ type itself
51  RNTupleVersion fTypeVersion;
52  /// The leaf name, not including parent fields
53  std::string fFieldName;
54  /// Free text set by the user
55  std::string fFieldDescription;
56  /// The C++ type that was used when writing the field
57  std::string fTypeName;
58  /// The number of elements per entry for fixed-size arrays
59  std::uint64_t fNRepetitions;
60  /// The structural information carried by this field in the data model tree
61  ENTupleStructure fStructure;
62  /// Establishes sub field relationships, such as classes and collections
63  DescriptorId_t fParentId = kInvalidDescriptorId;
64  /// The pointers in the other direction from parent to children. They are serialized, too, to keep the
65  /// order of sub fields.
66  std::vector<DescriptorId_t> fLinkIds;
67 
68 public:
69  /// In order to handle changes to the serialization routine in future ntuple versions
70  static constexpr std::uint16_t kFrameVersionCurrent = 0;
71  static constexpr std::uint16_t kFrameVersionMin = 0;
72 
73  RFieldDescriptor() = default;
74  RFieldDescriptor(const RFieldDescriptor &other) = delete;
75  RFieldDescriptor &operator =(const RFieldDescriptor &other) = delete;
76  RFieldDescriptor(RFieldDescriptor &&other) = default;
77  RFieldDescriptor &operator =(RFieldDescriptor &&other) = default;
78 
79  bool operator==(const RFieldDescriptor &other) const;
80 
81  DescriptorId_t GetId() const { return fFieldId; }
82  RNTupleVersion GetFieldVersion() const { return fFieldVersion; }
83  RNTupleVersion GetTypeVersion() const { return fTypeVersion; }
84  std::string GetFieldName() const { return fFieldName; }
85  std::string GetFieldDescription() const { return fFieldDescription; }
86  std::string GetTypeName() const { return fTypeName; }
87  std::uint64_t GetNRepetitions() const { return fNRepetitions; }
88  ENTupleStructure GetStructure() const { return fStructure; }
89  DescriptorId_t GetParentId() const { return fParentId; }
90  const std::vector<DescriptorId_t> &GetLinkIds() const { return fLinkIds; }
91 };
92 
93 
94 // clang-format off
95 /**
96 \class ROOT::Experimental::RColumnDescriptor
97 \ingroup NTuple
98 \brief Meta-data stored for every column of an ntuple
99 */
100 // clang-format on
101 class RColumnDescriptor {
102  friend class RNTupleDescriptorBuilder;
103 
104 private:
105  DescriptorId_t fColumnId = kInvalidDescriptorId;
106  /// Versions can change, e.g., when new column types are added
107  RNTupleVersion fVersion;
108  /// Contains the column type and whether it is sorted
109  RColumnModel fModel;
110  /// Every column belongs to one and only one field
111  DescriptorId_t fFieldId = kInvalidDescriptorId;
112  /// A field can be serialized into several columns, which are numbered from zero to $n$
113  std::uint32_t fIndex;
114 
115 public:
116  /// In order to handle changes to the serialization routine in future ntuple versions
117  static constexpr std::uint16_t kFrameVersionCurrent = 0;
118  static constexpr std::uint16_t kFrameVersionMin = 0;
119 
120  RColumnDescriptor() = default;
121  RColumnDescriptor(const RColumnDescriptor &other) = delete;
122  RColumnDescriptor &operator =(const RColumnDescriptor &other) = delete;
123  RColumnDescriptor(RColumnDescriptor &&other) = default;
124  RColumnDescriptor &operator =(RColumnDescriptor &&other) = default;
125 
126  bool operator==(const RColumnDescriptor &other) const;
127 
128  DescriptorId_t GetId() const { return fColumnId; }
129  RNTupleVersion GetVersion() const { return fVersion; }
130  RColumnModel GetModel() const { return fModel; }
131  std::uint32_t GetIndex() const { return fIndex; }
132  DescriptorId_t GetFieldId() const { return fFieldId; }
133 };
134 
135 
136 // clang-format off
137 /**
138 \class ROOT::Experimental::RClusterDescriptor
139 \ingroup NTuple
140 \brief Meta-data for a set of ntuple clusters
141 
142 The cluster descriptor might carry information of only a subset of available clusters, for instance if multiple
143 files are chained and not all of them have been processed yet.
144 */
145 // clang-format on
146 class RClusterDescriptor {
147  friend class RNTupleDescriptorBuilder;
148 
149 public:
150  /// Generic information about the physical location of data. Values depend on the concrete storage type. E.g.,
151  /// for a local file fUrl might be unsused and fPosition might be a file offset. Objects on storage can be compressed
152  /// and therefore we need to store their actual size.
153  struct RLocator {
154  std::int64_t fPosition = 0;
155  std::uint32_t fBytesOnStorage = 0;
156  std::string fUrl;
157 
158  bool operator==(const RLocator &other) const {
159  return fPosition == other.fPosition && fBytesOnStorage == other.fBytesOnStorage && fUrl == other.fUrl;
160  }
161  };
162 
163  /// The window of element indexes of a particular column in a particular cluster
164  struct RColumnRange {
165  DescriptorId_t fColumnId = kInvalidDescriptorId;
166  /// A 64bit element index
167  NTupleSize_t fFirstElementIndex = kInvalidNTupleIndex;
168  /// A 32bit value for the number of column elements in the cluster
169  ClusterSize_t fNElements = kInvalidClusterIndex;
170  /// The usual format for ROOT compression settings (see Compression.h).
171  /// The pages of a particular column in a particular cluster are all compressed with the same settings.
172  std::int64_t fCompressionSettings = 0;
173 
174  // TODO(jblomer): we perhaps want to store summary information, such as average, min/max, etc.
175  // Should this be done on the field level?
176 
177  bool operator==(const RColumnRange &other) const {
178  return fColumnId == other.fColumnId && fFirstElementIndex == other.fFirstElementIndex &&
179  fNElements == other.fNElements && fCompressionSettings == other.fCompressionSettings;
180  }
181 
182  bool Contains(NTupleSize_t index) const {
183  return (fFirstElementIndex <= index && (fFirstElementIndex + fNElements) > index);
184  }
185  };
186 
187  /// Records the parition of data into pages for a particular column in a particular cluster
188  struct RPageRange {
189  /// We do not need to store the element size / uncompressed page size because we know to which column
190  /// the page belongs
191  struct RPageInfo {
192  /// The sum of the elements of all the pages must match the corresponding fNElements field in fColumnRanges
193  ClusterSize_t fNElements = kInvalidClusterIndex;
194  /// The meaning of fLocator depends on the storage backend.
195  RLocator fLocator;
196 
197  bool operator==(const RPageInfo &other) const {
198  return fNElements == other.fNElements && fLocator == other.fLocator;
199  }
200  };
201 
202  RPageRange() = default;
203  RPageRange(const RPageRange &other) = delete;
204  RPageRange &operator =(const RPageRange &other) = delete;
205  RPageRange(RPageRange &&other) = default;
206  RPageRange &operator =(RPageRange &&other) = default;
207 
208  DescriptorId_t fColumnId = kInvalidDescriptorId;
209  std::vector<RPageInfo> fPageInfos;
210 
211  bool operator==(const RPageRange &other) const {
212  return fColumnId == other.fColumnId && fPageInfos == other.fPageInfos;
213  }
214  };
215 
216 private:
217  DescriptorId_t fClusterId = kInvalidDescriptorId;
218  /// Future versions of the cluster descriptor might add more meta-data, e.g. a semantic checksum
219  RNTupleVersion fVersion;
220  /// Clusters can be swapped by adjusting the entry offsets
221  NTupleSize_t fFirstEntryIndex = kInvalidNTupleIndex;
222  ClusterSize_t fNEntries = kInvalidClusterIndex;
223  /// For pre-fetching / caching an entire contiguous cluster
224  RLocator fLocator;
225 
226  std::unordered_map<DescriptorId_t, RColumnRange> fColumnRanges;
227  std::unordered_map<DescriptorId_t, RPageRange> fPageRanges;
228 
229 public:
230  /// In order to handle changes to the serialization routine in future ntuple versions
231  static constexpr std::uint16_t kFrameVersionCurrent = 0;
232  static constexpr std::uint16_t kFrameVersionMin = 0;
233 
234  RClusterDescriptor() = default;
235  RClusterDescriptor(const RClusterDescriptor &other) = delete;
236  RClusterDescriptor &operator =(const RClusterDescriptor &other) = delete;
237  RClusterDescriptor(RClusterDescriptor &&other) = default;
238  RClusterDescriptor &operator =(RClusterDescriptor &&other) = default;
239 
240  bool operator==(const RClusterDescriptor &other) const;
241 
242  DescriptorId_t GetId() const { return fClusterId; }
243  RNTupleVersion GetVersion() const { return fVersion; }
244  NTupleSize_t GetFirstEntryIndex() const { return fFirstEntryIndex; }
245  ClusterSize_t GetNEntries() const { return fNEntries; }
246  RLocator GetLocator() const { return fLocator; }
247  const RColumnRange &GetColumnRange(DescriptorId_t columnId) const { return fColumnRanges.at(columnId); }
248  const RPageRange &GetPageRange(DescriptorId_t columnId) const { return fPageRanges.at(columnId); }
249 };
250 
251 
252 // clang-format off
253 /**
254 \class ROOT::Experimental::RNTupleDescriptor
255 \ingroup NTuple
256 \brief The on-storage meta-data of an ntuple
257 
258 Represents the on-disk (on storage) information about an ntuple. The meta-data consists of a header and one or
259 several footers. The header carries the ntuple schema, i.e. the fields and the associated columns and their
260 relationships. The footer(s) carry information about one or several clusters. For every cluster, a footer stores
261 its location and size, and for every column the range of element indexes as well as a list of pages and page
262 locations.
263 
264 The descriptor provides machine-independent (de-)serialization of headers and footers, and it provides lookup routines
265 for ntuple objects (pages, clusters, ...). It is supposed to be usable by all RPageStorage implementations.
266 
267 The serialization does not use standard ROOT streamers in order to not let it depend on libCore. The serialization uses
268 the concept of frames: header, footer, and substructures have a preamble with version numbers and the size of the
269 written struct. This allows for forward and backward compatibility when the meta-data evolves.
270 */
271 // clang-format on
272 class RNTupleDescriptor {
273  friend class RNTupleDescriptorBuilder;
274 
275 private:
276  /// The ntuple name needs to be unique in a given storage location (file)
277  std::string fName;
278  /// Free text from the user
279  std::string fDescription;
280  /// The origin of the data
281  std::string fAuthor;
282  /// The current responsible for storing the data
283  std::string fCustodian;
284  /// The time stamp of the ntuple data (immutable)
285  std::chrono::system_clock::time_point fTimeStampData;
286  /// The time stamp of writing the data to storage, which gets updated when re-written
287  std::chrono::system_clock::time_point fTimeStampWritten;
288  /// The version evolves with the ntuple summary meta-data
289  RNTupleVersion fVersion;
290  /// Every NTuple gets a unique identifier
291  RNTupleUuid fOwnUuid;
292  /// Column sets that are created as derived sets from existing NTuples share the same group id.
293  /// NTuples in the same group have the same number of entries and are supposed to contain associated data.
294  RNTupleUuid fGroupUuid;
295 
296  std::unordered_map<DescriptorId_t, RFieldDescriptor> fFieldDescriptors;
297  std::unordered_map<DescriptorId_t, RColumnDescriptor> fColumnDescriptors;
298  /// May contain only a subset of all the available clusters, e.g. the clusters of the current file
299  /// from a chain of files
300  std::unordered_map<DescriptorId_t, RClusterDescriptor> fClusterDescriptors;
301 
302 public:
303  /// In order to handle changes to the serialization routine in future ntuple versions
304  static constexpr std::uint16_t kFrameVersionCurrent = 0;
305  static constexpr std::uint16_t kFrameVersionMin = 0;
306  /// The preamble is sufficient to get the length of the header
307  static constexpr unsigned int kNBytesPreamble = 8;
308  /// The last few bytes after the footer store the length of footer and header
309  static constexpr unsigned int kNBytesPostscript = 16;
310 
311  RNTupleDescriptor() = default;
312  RNTupleDescriptor(const RNTupleDescriptor &other) = delete;
313  RNTupleDescriptor &operator=(const RNTupleDescriptor &other) = delete;
314  RNTupleDescriptor(RNTupleDescriptor &&other) = default;
315  RNTupleDescriptor &operator=(RNTupleDescriptor &&other) = default;
316 
317  bool operator ==(const RNTupleDescriptor &other) const;
318 
319  /// We deliberately do not use ROOT's built-in serialization in order to allow for use of RNTuple's without libCore
320  /// Serializes the global ntuple information as well as the column and field schemata
321  /// Returns the number of bytes and fills buffer if it is not nullptr.
322  std::uint32_t SerializeHeader(void* buffer) const;
323  /// Serializes cluster meta data. Returns the number of bytes and fills buffer if it is not nullptr.
324  std::uint32_t SerializeFooter(void* buffer) const;
325  /// Given kNBytesPostscript bytes, extract the header and footer lengths in bytes
326  static void LocateMetadata(const void *postscript, std::uint32_t &szHeader, std::uint32_t &szFooter);
327 
328  const RFieldDescriptor& GetFieldDescriptor(DescriptorId_t fieldId) const { return fFieldDescriptors.at(fieldId); }
329  const RColumnDescriptor& GetColumnDescriptor(DescriptorId_t columnId) const {
330  return fColumnDescriptors.at(columnId);
331  }
332  const RClusterDescriptor& GetClusterDescriptor(DescriptorId_t clusterId) const {
333  return fClusterDescriptors.at(clusterId);
334  }
335  std::string GetName() const { return fName; }
336  std::string GetDescription() const { return fDescription; }
337  std::string GetAuthor() const { return fAuthor; }
338  std::string GetCustodian() const { return fCustodian; }
339  std::chrono::system_clock::time_point GetTimeStampData() const { return fTimeStampData; }
340  std::chrono::system_clock::time_point GetTimeStampWritten() const { return fTimeStampWritten; }
341  RNTupleVersion GetVersion() const { return fVersion; }
342  RNTupleUuid GetOwnUuid() const { return fOwnUuid; }
343  RNTupleUuid GetGroupUuid() const { return fGroupUuid; }
344 
345  std::size_t GetNFields() const { return fFieldDescriptors.size(); }
346  std::size_t GetNColumns() const { return fColumnDescriptors.size(); }
347  std::size_t GetNClusters() const { return fClusterDescriptors.size(); }
348 
349  // The number of entries as seen with the currently loaded cluster meta-data; there might be more
350  NTupleSize_t GetNEntries() const;
351  NTupleSize_t GetNElements(DescriptorId_t columnId) const;
352 
353  DescriptorId_t FindFieldId(std::string_view fieldName, DescriptorId_t parentId) const;
354  /// Searches for a top-level field
355  DescriptorId_t FindFieldId(std::string_view fieldName) const;
356  DescriptorId_t FindColumnId(DescriptorId_t fieldId, std::uint32_t columnIndex) const;
357  DescriptorId_t FindClusterId(DescriptorId_t columnId, NTupleSize_t index) const;
358 
359  /// Re-create the C++ model from the stored meta-data
360  std::unique_ptr<RNTupleModel> GenerateModel() const;
361  void PrintInfo(std::ostream &output) const;
362 };
363 
364 
365 // clang-format off
366 /**
367 \class ROOT::Experimental::RNTupleDescriptorBuilder
368 \ingroup NTuple
369 \brief A helper class for piece-wise construction of an RNTupleDescriptor
370 
371 Used by RPageStorage implementations in order to construct the RNTupleDescriptor from the various header parts.
372 */
373 // clang-format on
374 class RNTupleDescriptorBuilder {
375 private:
376  RNTupleDescriptor fDescriptor;
377 
378 public:
379  bool IsValid() const { return true; /* TODO(jblomer) */}
380  const RNTupleDescriptor& GetDescriptor() const { return fDescriptor; }
381  RNTupleDescriptor MoveDescriptor();
382 
383  void SetNTuple(const std::string_view name, const std::string_view description, const std::string_view author,
384  const RNTupleVersion &version, const RNTupleUuid &uuid);
385 
386  void AddField(DescriptorId_t fieldId, const RNTupleVersion &fieldVersion, const RNTupleVersion &typeVersion,
387  std::string_view fieldName, std::string_view typeName, std::uint64_t nRepetitions,
388  ENTupleStructure structure);
389  void AddFieldLink(DescriptorId_t fieldId, DescriptorId_t linkId);
390 
391  void AddColumn(DescriptorId_t columnId, DescriptorId_t fieldId,
392  const RNTupleVersion &version, const RColumnModel &model, std::uint32_t index);
393 
394  void SetFromHeader(void* headerBuffer);
395 
396  void AddCluster(DescriptorId_t clusterId, RNTupleVersion version,
397  NTupleSize_t firstEntryIndex, ClusterSize_t nEntries);
398  void SetClusterLocator(DescriptorId_t clusterId, RClusterDescriptor::RLocator locator);
399  void AddClusterColumnRange(DescriptorId_t clusterId, const RClusterDescriptor::RColumnRange &columnRange);
400  void AddClusterPageRange(DescriptorId_t clusterId, RClusterDescriptor::RPageRange &&pageRange);
401 
402  void AddClustersFromFooter(void* footerBuffer);
403 };
404 
405 } // namespace Experimental
406 } // namespace ROOT
407 
408 #endif