16 #ifndef __CSV_FEATURE_UTILS_H__
17 #define __CSV_FEATURE_UTILS_H__
21 #include "services/collection.h"
22 #include "services/daal_string.h"
23 #include "data_management/features/defines.h"
27 namespace data_management
/**
 * Class that parses a single row of CSV text and exposes an iterator-like
 * interface (next() / good() / getCurrentToken()) over the tokens of that row.
 * NOTE(review): only part of the member list is visible in this listing; the
 * methods also use _rawData, _pos, _prevPos, _tokenSize and _goodFlag.
 */
37 class CSVRowTokenizer :
public Base
// Exclusive upper bound for indices into the raw buffer (checked by isValidSymbol())
41 const size_t _rawDataSize;
// Character that separates tokens within the row (checked by isStopSymbol())
42 const char _delimiter;
/**
 * Creates a tokenizer over a raw character buffer.
 * \param[in] rawData      Buffer with the row text; it is non-const because next()
 *                         overwrites delimiter characters with '\0'.
 *                         NOTE(review): presumably stored in _rawData - the
 *                         initializer is not visible in this listing.
 * \param[in] rawDataSize  Number of characters that may be addressed in the buffer
 * \param[in] delimiter    Character treated as the token separator
 */
50 explicit CSVRowTokenizer(
char *rawData,
size_t rawDataSize,
char delimiter) :
52 _rawDataSize(rawDataSize),
// The trailing comma shows that more member initializers follow in the full source
53 _delimiter(delimiter),
/**
 * Advances the tokenizer to the next token of the row.
 * Returns immediately when good() already reports false.
 * NOTE(review): several statements of this method (including the loop body)
 * are not visible in this listing.
 */
69 DAAL_FORCEINLINE
void next()
74 if (!good()) {
return; }
// Loop runs while _pos addresses a character that is inside the buffer,
// is not '\0' and is not the delimiter
78 while (isValidSymbol(_pos) && !isStopSymbol(_pos))
// Token length is the distance between the token start and the stop position
81 _tokenSize = _pos - _prevPos;
// The token counts as good only if its start lies inside the buffer on a non-'\0' character
82 _goodFlag = isValidSymbol(_prevPos);
// If scanning stopped on a delimiter, replace it with '\0' in the raw buffer
// (presumably so the current token is null-terminated in place - confirm)
84 if (isValidSymbol(_pos) && isStopSymbol(_pos))
86 _rawData[_pos] =
'\0';
/**
 * Status query for the tokenizer; next() calls it and returns early when it is false.
 * NOTE(review): the body is not visible in this listing - presumably it returns
 * _goodFlag, which next() updates; confirm against the full source.
 */
91 DAAL_FORCEINLINE
bool good()
const
96 DAAL_FORCEINLINE services::StringView getCurrentToken()
const
98 return services::StringView(_rawData + _prevPos, _tokenSize);
102 DAAL_FORCEINLINE
bool isValidSymbol(
size_t index)
const
104 return index < _rawDataSize &&
105 _rawData[index] !=
'\0';
108 DAAL_FORCEINLINE
bool isStopSymbol(
size_t index)
const
110 return _rawData[index] == _delimiter;
// Copy constructor and copy assignment are declared here without bodies.
// NOTE(review): presumably the pre-C++11 idiom for making the class non-copyable -
// confirm that they sit in a private section and are never defined.
113 CSVRowTokenizer(
const CSVRowTokenizer &);
114 CSVRowTokenizer &operator=(
const CSVRowTokenizer &);
/**
 * Class that holds auxiliary information about features being parsed:
 * a collection of feature names and a collection of detected feature types.
 */
121 class CSVFeaturesInfo :
public Base
124 services::Status addFeatureName(
const services::StringView &featureName)
126 const services::String featureNameStr(featureName.begin(), featureName.end());
127 if ( !_featureNames.safe_push_back(featureNameStr) )
129 return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
131 return services::Status();
134 services::Status addFeatureType(
const services::StringView &token)
136 const features::FeatureType featureType = detectFeatureType(token);
137 if ( !_featureTypes.safe_push_back(featureType) )
139 return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
141 return services::Status();
144 size_t getNumberOfFeatures()
const
147 if (_featureNames.size() != 0)
149 DAAL_ASSERT( _featureNames.size() == _featureTypes.size() );
150 return _featureNames.size();
152 return _featureTypes.size();
155 const services::String &getFeatureName(
size_t featureIndex)
const
157 DAAL_ASSERT( _featureNames.size() == 0 ||
158 _featureNames.size() == _featureTypes.size() );
159 DAAL_ASSERT( featureIndex < _featureNames.size() );
160 return _featureNames[featureIndex];
163 features::FeatureType getDetectedFeatureType(
size_t featureIndex)
const
165 DAAL_ASSERT( featureIndex < _featureTypes.size() );
166 return _featureTypes[featureIndex];
169 bool areFeatureNamesAvailable()
const
171 return _featureNames.size() > 0;
175 static features::FeatureType detectFeatureType(
const services::StringView &token)
177 return isNumericalFeature(token)
178 ? features::DAAL_CONTINUOUS
179 : features::DAAL_CATEGORICAL;
182 static bool isNumericalFeature(
const services::StringView &token)
184 std::istringstream iss(token.c_str());
185 DAAL_DATA_TYPE f = 0.0; iss >> f;
186 return !(iss.fail());
// Feature names appended by addFeatureName(); may stay empty (see areFeatureNamesAvailable())
190 services::Collection<services::String> _featureNames;
// Feature types appended by addFeatureType(); asserted to have the same size as
// _featureNames whenever names are present
191 services::Collection<features::FeatureType> _featureTypes;
daal::data_management::internal::CSVFeaturesInfo
Class that holds auxiliary information about features being parsed.
Definition: csv_feature_utils.h:121
daal
Definition: algorithm_base_common.h:31
daal::services::ErrorMemoryAllocationFailed
Definition: error_indexes.h:146
daal::data_management::internal::CSVRowTokenizer
Class that parses a single row of a CSV file and implements an iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::Base
Base class for Intel(R) Data Analytics Acceleration Library objects
Definition: base.h:39