Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
RCsvDS.hxx
Go to the documentation of this file.
1 // Author: Enric Tejedor CERN 10/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RCSVTDS
12 #define ROOT_RCSVTDS
13 
14 #include "ROOT/RDataFrame.hxx"
15 #include "ROOT/RDataSource.hxx"
16 
17 #include <deque>
18 #include <list>
19 #include <map>
20 #include <vector>
21 
22 #include <TRegexp.h>
23 
24 namespace ROOT {
25 
26 namespace RDF {
27 
28 class RCsvDS final : public ROOT::RDF::RDataSource {
29 // RDataSource subclass built from a CSV file name, a header flag, a delimiter and a lines-chunk size (see constructor).
30 private:
31  // Column type tag. Possible values are d, b, l, s: a single char suffices only because we treat just double, bool, Long64_t and string.
32  using ColType_t = char;
33  static const std::map<ColType_t, std::string> fgColTypeMap; // type tag -> string, presumably the type name; defined out of line
34 
35  std::streampos fDataPos = 0; // a stream position; presumably where the data rows start in fStream (TODO confirm)
36  bool fReadHeaders = false;
37  unsigned int fNSlots = 0U;
38  std::ifstream fStream; // NOTE(review): <fstream> is not among this header's direct includes; it has to arrive transitively
39  const char fDelimiter; // const without initialiser: must be set in the constructor's initialiser list
40  const Long64_t fLinesChunkSize; // const without initialiser: must be set in the constructor's initialiser list
41  ULong64_t fEntryRangesRequested = 0ULL;
42  ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
43  std::vector<std::string> fHeaders;
44  std::map<std::string, ColType_t> fColTypes; // string key -> type tag; presumably keyed by column name (TODO confirm)
45  std::list<ColType_t> fColTypesList;
46  std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot]
47  std::vector<Record_t> fRecords; // fRecords[entry][column]
48  std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
49  std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
50  std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
51  // This must be a deque to avoid the specialisation vector<bool>, which would not
52  // work here because a pointer to an individual boolean element cannot be taken from it
53  std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
54 // Static regular expressions; judging by their names they recognise integer, floating-point and boolean text (TODO confirm).
55  static TRegexp intRegex, doubleRegex1, doubleRegex2, doubleRegex3, trueRegex, falseRegex;
56 // Private helpers: declared here only, their definitions are not part of this header.
57  void FillHeaders(const std::string &);
58  void FillRecord(const std::string &, Record_t &);
59  void GenerateHeaders(size_t);
60  std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &);
61  void InferColTypes(std::vector<std::string> &);
62  void InferType(const std::string &, unsigned int);
63  std::vector<std::string> ParseColumns(const std::string &);
64  size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
65  ColType_t GetType(std::string_view colName) const; // type tag for the column called colName
66 
67 protected:
68  std::string AsString();
69 // NOTE(review): several methods below look like overrides of RDataSource virtuals but none is marked `override`; confirm against the base class.
70 public:
71  RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL); // NOTE(review): not `explicit`, so a lone std::string_view converts implicitly
72  void Finalise();
73  void FreeRecords();
74  ~RCsvDS();
75  const std::vector<std::string> &GetColumnNames() const;
76  std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges();
77  std::string GetTypeName(std::string_view colName) const;
78  bool HasColumn(std::string_view colName) const;
79  bool SetEntry(unsigned int slot, ULong64_t entry);
80  void SetNSlots(unsigned int nSlots);
81  std::string GetLabel();
82 };
83 
84 ////////////////////////////////////////////////////////////////////////////////////////////////
85 /// \brief Factory method to create (and return) a CSV RDataFrame.
86 /// \param[in] fileName Path of the CSV file.
87 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise (default `true`).
88 /// \param[in] delimiter Delimiter character (default ',').
89 /// \param[in] linesChunkSize Lines chunk size (default `-1`). NOTE(review): the meaning of `-1` is not shown here; confirm.
90 RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
91  Long64_t linesChunkSize = -1LL);
92 
93 } // ns RDF
94 
95 } // ns ROOT
96 
97 #endif