C++ API Reference for Intel® Data Analytics Acceleration Library 2019 Update 4

csv_feature_utils.h
1 /* file: csv_feature_utils.h */
2 /*******************************************************************************
3 * Copyright 2014-2019 Intel Corporation.
4 *
5 * This software and the related documents are Intel copyrighted materials, and
6 * your use of them is governed by the express license under which they were
7 * provided to you (License). Unless the License provides otherwise, you may not
8 * use, modify, copy, publish, distribute, disclose or transmit this software or
9 * the related documents without Intel's prior written permission.
10 *
11 * This software and the related documents are provided as is, with no express
12 * or implied warranties, other than those that are expressly stated in the
13 * License.
14 *******************************************************************************/
15 
16 #ifndef __CSV_FEATURE_UTILS_H__
17 #define __CSV_FEATURE_UTILS_H__
18 
#include <sstream>
#include <string>

#include "services/collection.h"
#include "services/daal_string.h"
#include "data_management/features/defines.h"
24 
25 namespace daal
26 {
27 namespace data_management
28 {
29 namespace internal
30 {
31 
37 class CSVRowTokenizer : public Base
38 {
39 private:
40  char *_rawData;
41  const size_t _rawDataSize;
42  const char _delimiter;
43 
44  size_t _pos;
45  size_t _prevPos;
46  size_t _tokenSize;
47  bool _goodFlag;
48 
49 public:
50  explicit CSVRowTokenizer(char *rawData, size_t rawDataSize, char delimiter) :
51  _rawData(rawData),
52  _rawDataSize(rawDataSize),
53  _delimiter(delimiter),
54  _pos(0),
55  _prevPos(0),
56  _tokenSize(0),
57  _goodFlag(true) { }
58 
59  void reset()
60  {
61  _pos = 0;
62  _prevPos = 0;
63  _tokenSize = 0;
64  _goodFlag = true;
65 
66  next();
67  }
68 
69  DAAL_FORCEINLINE void next()
70  {
71  /* We assume _rawData is single line of CSV file and
72  * has a termination character in the end */
73 
74  if (!good()) { return; }
75 
76  _prevPos = _pos;
77 
78  while (isValidSymbol(_pos) && !isStopSymbol(_pos))
79  { _pos++; }
80 
81  _tokenSize = _pos - _prevPos;
82  _goodFlag = isValidSymbol(_prevPos);
83 
84  if (isValidSymbol(_pos) && isStopSymbol(_pos))
85  {
86  _rawData[_pos] = '\0';
87  _pos++;
88  }
89  }
90 
91  DAAL_FORCEINLINE bool good() const
92  {
93  return _goodFlag;
94  }
95 
96  DAAL_FORCEINLINE services::StringView getCurrentToken() const
97  {
98  return services::StringView(_rawData + _prevPos, _tokenSize);
99  }
100 
101 private:
102  DAAL_FORCEINLINE bool isValidSymbol(size_t index) const
103  {
104  return index < _rawDataSize &&
105  _rawData[index] != '\0';
106  }
107 
108  DAAL_FORCEINLINE bool isStopSymbol(size_t index) const
109  {
110  return _rawData[index] == _delimiter;
111  }
112 
113  CSVRowTokenizer(const CSVRowTokenizer &);
114  CSVRowTokenizer &operator=(const CSVRowTokenizer &);
115 };
116 
121 class CSVFeaturesInfo : public Base
122 {
123 public:
124  services::Status addFeatureName(const services::StringView &featureName)
125  {
126  const services::String featureNameStr(featureName.begin(), featureName.end());
127  if ( !_featureNames.safe_push_back(featureNameStr) )
128  {
129  return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
130  }
131  return services::Status();
132  }
133 
134  services::Status addFeatureType(const services::StringView &token)
135  {
136  const features::FeatureType featureType = detectFeatureType(token);
137  if ( !_featureTypes.safe_push_back(featureType) )
138  {
139  return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
140  }
141  return services::Status();
142  }
143 
144  size_t getNumberOfFeatures() const
145  {
146  /* We allow _featureNames to be empty to support a no-header case */
147  if (_featureNames.size() != 0)
148  {
149  DAAL_ASSERT( _featureNames.size() == _featureTypes.size() );
150  return _featureNames.size();
151  }
152  return _featureTypes.size();
153  }
154 
155  const services::String &getFeatureName(size_t featureIndex) const
156  {
157  DAAL_ASSERT( _featureNames.size() == 0 ||
158  _featureNames.size() == _featureTypes.size() );
159  DAAL_ASSERT( featureIndex < _featureNames.size() );
160  return _featureNames[featureIndex];
161  }
162 
163  features::FeatureType getDetectedFeatureType(size_t featureIndex) const
164  {
165  DAAL_ASSERT( featureIndex < _featureTypes.size() );
166  return _featureTypes[featureIndex];
167  }
168 
169  bool areFeatureNamesAvailable() const
170  {
171  return _featureNames.size() > 0;
172  }
173 
174 private:
175  static features::FeatureType detectFeatureType(const services::StringView &token)
176  {
177  return isNumericalFeature(token)
178  ? features::DAAL_CONTINUOUS
179  : features::DAAL_CATEGORICAL;
180  }
181 
182  static bool isNumericalFeature(const services::StringView &token)
183  {
184  std::istringstream iss(token.c_str());
185  DAAL_DATA_TYPE f = 0.0; iss >> f;
186  return !(iss.fail());
187  }
188 
189 private:
190  services::Collection<services::String> _featureNames;
191  services::Collection<features::FeatureType> _featureTypes;
192 };
193 
194 } // namespace internal
195 } // namespace data_management
196 } // namespace daal
197 
198 #endif
daal::data_management::internal::CSVFeaturesInfo
Class that holds auxiliary information about features being parsed.
Definition: csv_feature_utils.h:121
daal
Definition: algorithm_base_common.h:31
daal::services::ErrorMemoryAllocationFailed
Definition: error_indexes.h:146
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::Base
Base class for Intel(R) Data Analytics Acceleration Library objects
Definition: base.h:39

For more complete information about compiler optimizations, see our Optimization Notice.