22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
25 #include "data_management/data/numeric_table.h"
26 #include "data_management/features/shortcuts.h"
27 #include "data_management/data_source/data_source.h"
28 #include "data_management/data_source/internal/csv_feature_utils.h"
29 #include "data_management/data_source/modifiers/csv/shortcuts.h"
30 #include "data_management/data_source/modifiers/csv/internal/engine.h"
34 namespace data_management
// Visible tail of the default constructor's initializer list: both feature
// pointers are null and the category count is zero.
45 dsFeat(0), ntFeat(0), nCats(0) { }
// Builds the auxiliary record for one data source column, binding it to its
// DataSourceFeature and NumericTableFeature. The record starts with wide == 1,
// nCats == 0 and a default-constructed buffer.
// NOTE(review): `index` is presumably stored into `idx` on a line this
// listing omits -- confirm against the full header.
47 explicit FeatureAuxData(
size_t index,
48 DataSourceFeature *dataSourceFeature,
49 NumericTableFeature *numericTableFeature) :
51 dsFeat(dataSourceFeature),
52 ntFeat(numericTableFeature),
53 wide(1), nCats(0), buffer() { }
// Data source feature; catFunc/binFunc read its categorical dictionary.
58 DataSourceFeature *dsFeat;
// Numeric table feature; catFunc/binFunc update its categoryNumber.
59 NumericTableFeature *ntFeat;
// Signature of a per-column token handler. CSVFeatureManager::parseRowIn calls
// it with the token text, the column's auxiliary data and the output row array.
63 typedef void (*functionT)(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
// Hook implemented by each modifier: receives the per-column handler list and
// the per-column auxiliary data by reference so it can adjust them in place.
72 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const = 0;
// Virtual destructor so modifiers can be destroyed through the interface.
74 virtual ~ModifierIface() {}
// Token handler for continuous features: converts the token text to a number
// through readNumeric.
// NOTE(review): the declaration of `f` and the store of the parsed value into
// `arr` are not visible in this listing (presumably arr[aux.idx] = f) -- confirm.
76 static void contFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
79 readNumeric<>( word, f );
// Token handler for categorical features: maps the token to an integer
// category id through the feature's categorical dictionary and writes that id
// to arr[aux.idx].
83 static void catFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
// Copy the token into the per-column buffer so it can serve as the lookup key.
85 aux.buffer.assign(word);
87 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
88 CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
// Known token: reuse the id stored as the first element of the mapped pair.
// NOTE(review): a statement on a line omitted from this listing may also
// update the second element of the pair -- confirm.
90 if( it != catDict->end() )
92 arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
// Unseen token (presumably the `else` branch; its keyword line is omitted
// here): the new id is the current dictionary size, the entry is inserted with
// its second element set to 1, and the numeric table feature's categoryNumber
// becomes index + 1.
97 int index = (int)(catDict->size());
98 catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>(index, 1) ) );
99 arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
100 aux.ntFeat->categoryNumber = index + 1;
104 static void nullFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
// Converts `text` to a number with daal_string_to_float and stores it in `f`.
// NOTE(review): `T` is presumably a template parameter declared on a line this
// listing omits; the meaning of the second argument (0) is defined by
// daal_string_to_float and is not visible here -- confirm.
108 static void readNumeric(
const char *text, T &f)
110 f = daal::services::daal_string_to_float(text, 0);
// Token handler for one-hot encoded features: resolves the token to a category
// index through the feature's categorical dictionary, then writes aux.nCats
// values starting at arr[aux.idx] -- 1 at the matching position, 0 elsewhere.
// An index that is >= aux.nCats therefore produces an all-zero group.
113 static void binFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
// Copy the token into the per-column buffer so it can serve as the lookup key.
115 aux.buffer.assign(word);
117 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
118 CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
// NOTE(review): `index` is declared on a line omitted from this listing.
// Known token: reuse the stored category index.
122 if( it != catDict->end() )
124 index = it->second.first;
// Unseen token (presumably the `else` branch; its keyword line is omitted
// here): the new index is the current dictionary size, and categoryNumber
// is updated to index + 1.
129 index = catDict->size();
130 catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>((
int)index, 1) ) );
131 aux.ntFeat->categoryNumber = index + 1;
134 size_t nCats = aux.nCats;
// Emit the one-hot group for this column.
136 for(
size_t i=0; i<nCats; i++)
138 arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
// Modifier that switches one column to categorical handling.
147 class MakeCategorical :
public ModifierIface
// idx: position of the data source column to treat as categorical.
151 MakeCategorical(
size_t idx) : idx(idx) {}
153 virtual ~MakeCategorical() {}
// Installs catFunc as the handler of column `idx` and resizes that column's
// token buffer to 1024.
// NOTE(review): lines omitted from this listing sit between the nCols
// computation and the assignment -- presumably a bounds check of idx against
// nCols; confirm.
155 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
157 size_t nCols = funcList.size();
161 funcList[idx] = catFunc;
162 auxVect[idx].buffer.resize(1024);
// Modifier that switches one column to one-hot encoding with a fixed number
// of categories.
171 class OneHotEncoder :
public ModifierIface
// idx: position of the data source column; nCats: width of the one-hot group.
176 OneHotEncoder(
size_t idx,
size_t nCats) : idx(idx), nCats(nCats) {}
178 virtual ~OneHotEncoder() {}
// Installs binFunc for column `idx`, resizes its token buffer to 1024, and
// records nCats both as the category count and as the column's output width.
// NOTE(review): lines omitted from this listing follow the nCols computation --
// presumably a bounds check of idx; confirm.
180 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
182 size_t nCols = funcList.size();
186 funcList[idx] = binFunc;
187 auxVect[idx].buffer.resize(1024);
188 auxVect[idx].nCats = nCats;
189 auxVect[idx].wide = nCats;
// Recompute every column's starting offset in the output row as the running
// sum of the widths of the columns before it.
// NOTE(review): nNTCols is declared on an omitted line, presumably starting
// at 0 -- confirm.
193 for(
size_t i=0; i<nCols; i++)
195 auxVect[i].idx = nNTCols;
196 nNTCols += auxVect[i].wide;
// Modifier that disables selected columns by replacing their handlers with
// nullFunc. The selection mode is chosen through the chainable setters below.
205 class ColumnFilter :
public ModifierIface
// Column indices supplied through list().
211 services::Collection<size_t> validList;
// All selection modes start disabled.
213 ColumnFilter() : oddFlag(
false), evenFlag(
false), noneFlag(
false), listFlag(
false) {}
215 virtual ~ColumnFilter() {}
// Chainable setters: each enables one selection mode and returns *this.
217 ColumnFilter& odd() { oddFlag=
true;
return *
this;}
218 ColumnFilter& even() {evenFlag=
true;
return *
this;}
219 ColumnFilter& none() {noneFlag=
true;
return *
this;}
// Stores a copy of `valid` in validList and enables list mode.
220 ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=
true;
return *
this;}
// Applies the enabled selection modes to the handler list.
// NOTE(review): the `if` lines that tie each loop below to a flag are omitted
// from this listing; the per-loop notes describe only what the visible loop
// does -- confirm the guarding flag against the full header.
222 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
224 size_t nCols = funcList.size();
// Disables columns at indices 0, 2, 4, ...
228 for(
size_t i=0; i<nCols; i+=2)
230 funcList[i] = nullFunc;
// Disables columns at indices 1, 3, 5, ...
237 for(
size_t i=1; i<nCols; i+=2)
239 funcList[i] = nullFunc;
// Disables every column.
246 for(
size_t i=0; i<nCols; i++)
248 funcList[i] = nullFunc;
// List mode: a per-column flag collection is built from validList, and every
// column whose flag is not set is disabled.
// NOTE(review): the bodies that initialise and set the flags are omitted here.
255 services::Collection<bool> flags(nCols);
257 for(
size_t i=0; i<nCols; i++)
262 for(
size_t i=0; i<validList.size(); i++)
264 size_t el = validList[i];
271 for(
size_t i=0; i<nCols; i++)
273 if(flags[i])
continue;
274 funcList[i] = nullFunc;
// Recompute every column's starting offset in the output row as the running
// sum of the widths of the columns before it.
// NOTE(review): nNTCols is declared on an omitted line, presumably starting
// at 0 -- confirm.
280 for(
size_t i=0; i<nCols; i++)
282 auxVect[i].idx = nNTCols;
283 nNTCols += auxVect[i].wide;
// Feature manager for CSV input: implements StringRowFeatureManagerIface by
// tokenizing raw rows and converting the tokens into numeric table values.
301 class CSVFeatureManager :
public StringRowFeatureManagerIface
// Default constructor; the header is marked as not yet parsed.
// NOTE(review): the remaining member initialisers are on lines omitted from
// this listing.
307 CSVFeatureManager() :
310 _isHeaderParsed(false) { }
312 virtual ~CSVFeatureManager() { }
317 void setDelimiter(
char delimiter )
319 _delimiter = delimiter;
327 size_t getNumericTableNumberOfColumns()
const
329 if (_modifiersManager)
331 return _modifiersManager->getNumberOfOutputFeatures();
334 const size_t nDSCols = auxVect.size();
335 return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
// Rebuilds the per-column handlers and auxiliary data from an existing
// dictionary and records its feature count as the expected number of tokens
// per row. A null dictionary is rejected with services::ErrorNullPtr through
// DAAL_CHECK.
// NOTE(review): lines omitted from this listing sit between the check and the
// fill call; fillAuxVectAndFuncList appends with push_back, so those lines
// presumably clear auxVect/funcList first -- confirm.
342 services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
344 DAAL_CHECK( dictionary, services::ErrorNullPtr );
348 fillAuxVectAndFuncList(*dictionary);
349 _numberOfTokens = dictionary->getNumberOfFeatures();
351 return services::Status();
358 void addModifier(
const ModifierIface &modifier)
360 modifier.apply(funcList, auxVect);
// Registers a CSV feature modifier for the given feature ids. The modifiers
// manager is created on first use; failures are reported through
// tryAssignStatusAndThrow using the optional `status` argument.
// NOTE(review): the conditions guarding the two tryAssignStatusAndThrow calls
// and the `return *this` statements are on lines omitted from this listing.
370 CSVFeatureManager &addModifier(
const features::FeatureIdCollectionIfacePtr &featureIds,
371 const modifiers::csv::FeatureModifierIfacePtr &modifier,
372 services::Status *status = NULL)
374 services::Status localStatus;
// Lazily create the modifiers manager.
375 if (!_modifiersManager)
377 _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
380 services::internal::tryAssignStatusAndThrow(status, localStatus);
// Accumulate the result of the registration into the local status.
385 localStatus |= _modifiersManager->addModifier(featureIds, modifier);
388 services::internal::tryAssignStatusAndThrow(status, localStatus);
400 void parseRowAsHeader(
char *rawRowData,
size_t rawDataSize)
402 DAAL_ASSERT( rawRowData );
404 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
405 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
407 _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
// Uses one data row to detect feature types and fill the dictionary. Every
// token is passed to the features info for type detection; afterwards the
// dictionary is filled either by the modifiers manager or by
// fillDictionaryWithoutModifiers.
// NOTE(review): several short lines are omitted from this listing (after the
// asserts and inside the token loop) and may do additional bookkeeping; the
// second fill call is presumably the `else` branch -- confirm.
417 virtual void parseRowAsDictionary(
char *rawRowData,
size_t rawDataSize,
418 DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
420 DAAL_ASSERT( rawRowData );
421 DAAL_ASSERT( dictionary );
425 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
426 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
429 _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
// With modifiers: let the manager prepare itself from the collected features
// info and fill the dictionary.
432 if (_modifiersManager)
434 _modifiersManager->prepare(_featuresInfo);
435 _modifiersManager->fillDictionary(*dictionary);
// Without modifiers: derive the dictionary from the detected feature types.
439 fillDictionaryWithoutModifiers(*dictionary);
// Parses one raw CSV row into row `ntRowIndex` of the numeric table. A
// write-only one-row block is obtained, the row is tokenized, at most
// _numberOfTokens tokens are consumed, and the block is released at the end.
// NOTE(review): the declaration of the token counter `i`, the `else` keyword
// and one short line before the visible asserts are omitted from this listing;
// `nt` is dereferenced below, so the omitted line presumably asserts it --
// confirm.
451 virtual void parseRowIn(
char *rawRowData,
size_t rawDataSize, DataSourceDictionary *dictionary,
452 NumericTable *nt,
size_t ntRowIndex) DAAL_C11_OVERRIDE
455 DAAL_ASSERT( dictionary );
456 DAAL_ASSERT( rawRowData );
// View over the single destination row.
458 nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
459 services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
460 _currentRowBlock.getNumberOfColumns());
463 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
// With modifiers: hand every token to the manager, then let it write the row.
465 if (_modifiersManager)
467 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
469 _modifiersManager->setToken(i, tokenizer.getCurrentToken());
471 _modifiersManager->applyModifiers(rowBuffer);
// Without modifiers: call the per-column handler for each token.
475 DAAL_DATA_TYPE *row = rowBuffer.data();
476 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
478 const services::StringView token = tokenizer.getCurrentToken();
479 funcList[i](token.c_str(), auxVect[i], row);
483 nt->releaseBlockOfRows(_currentRowBlock);
490 void finalize(DataSourceDictionary *dictionary)
492 if (_modifiersManager)
494 _modifiersManager->finalize();
495 _modifiersManager->fillDictionary(*dictionary);
// Fills the dictionary from the detected feature types when no modifiers
// manager is used: sets the feature count, stores each detected feature type,
// selects the numeric table storage type per feature, then rebuilds the
// per-column handlers and auxiliary data.
// NOTE(review): the `switch` header and the `break` lines are omitted from
// this listing.
500 void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
502 const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
503 dictionary.setNumberOfFeatures(nFeatures);
505 for (
size_t i = 0; i < nFeatures; i++)
507 features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
508 dictionary[i].ntFeature.featureType = fType;
// Continuous features are stored as DAAL_DATA_TYPE.
512 case features::DAAL_CONTINUOUS:
513 dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
// Ordinal and categorical features are stored as int.
516 case features::DAAL_ORDINAL:
517 case features::DAAL_CATEGORICAL:
518 dictionary[i].ntFeature.setType<
int>();
523 fillAuxVectAndFuncList(dictionary);
// Creates one auxiliary record and one token handler per dictionary feature.
// Each record points at the dictionary's own feature objects, so the
// dictionary must outlive the records.
// NOTE(review): entries are appended with push_back after resize(nFeatures).
// This is only correct if services::Collection::resize changes capacity rather
// than size, and if the collections are empty on entry -- confirm both against
// the Collection API and the callers.
526 void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
528 const size_t nFeatures = dictionary.getNumberOfFeatures();
529 auxVect.resize(nFeatures);
530 funcList.resize(nFeatures);
532 for (
size_t i = 0; i < nFeatures; i++)
534 DataSourceFeature &feature = dictionary[i];
535 NumericTableFeature &ntFeature = feature.ntFeature;
537 auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
538 funcList.push_back(getModifierFunctionPtr(ntFeature));
542 static functionT getModifierFunctionPtr(
const NumericTableFeature &ntFeature)
544 switch (ntFeature.featureType)
546 case features::DAAL_CONTINUOUS:
547 return ModifierIface::contFunc;
549 case features::DAAL_ORDINAL:
550 case features::DAAL_CATEGORICAL:
551 return ModifierIface::catFunc;
553 return ModifierIface::nullFunc;
// Per-column token handlers, indexed by data source column.
558 services::Collection<functionT> funcList;
// Per-column auxiliary data (output offset, width, feature pointers).
559 services::Collection<FeatureAuxData> auxVect;
// Initialised to false by the constructor.
562 bool _isHeaderParsed;
// Upper bound on the number of tokens consumed per row in parseRowIn.
563 size_t _numberOfTokens;
// Block descriptor reused for the destination row in parseRowIn.
564 BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
// Feature names and types collected while parsing the header and the first row.
566 internal::CSVFeaturesInfo _featuresInfo;
// Set once a CSV feature modifier has been added; when set, parsing is
// delegated to it.
567 modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
572 using interface1::CSVFeatureManager;
daal::data_management::OneHotEncoder
Methods of the class that mark a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:171
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:327
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Methods of the class that filter data source features out of the output numeric table.
Definition: csv_feature_manager.h:205
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:41
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:95
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Methods of the class that mark a feature as categorical.
Definition: csv_feature_manager.h:147
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:400
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:301
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:342
daal::services::ErrorNullPtr
Definition: error_indexes.h:139
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:285
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:317
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:370
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:69
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:575
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:358
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:266
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:307
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:451
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:417
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:490
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:161