22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
25 #include "data_management/data/numeric_table.h"
26 #include "data_management/features/shortcuts.h"
27 #include "data_management/data_source/data_source.h"
28 #include "data_management/data_source/internal/csv_feature_utils.h"
29 #include "data_management/data_source/modifiers/csv/shortcuts.h"
30 #include "data_management/data_source/modifiers/csv/internal/engine.h"
34 namespace data_management
// Auxiliary per-feature data handed to the token-parsing callbacks.
// NOTE(review): this listing omits some lines of the type (its header and several
// members/initializers are not visible); comments below cover only what is shown.
// Tail of a constructor initializer list: null feature pointers and zero categories.
45 dsFeat(0), ntFeat(0), nCats(0) { }
// Constructor from a feature index and pointers to the data-source feature and the
// numeric-table feature. The pointers are stored as given; no copy is made here.
// NOTE(review): `index` is presumably stored in `idx` on a line not shown — confirm.
47 explicit FeatureAuxData(
size_t index,
48 DataSourceFeature *dataSourceFeature,
49 NumericTableFeature *numericTableFeature) :
51 dsFeat(dataSourceFeature),
52 ntFeat(numericTableFeature),
// dsFeat gives the callbacks access to the categorical dictionary (see catFunc/binFunc);
// ntFeat is where they record categoryNumber.
58 DataSourceFeature *dsFeat;
59 NumericTableFeature *ntFeat;
// Signature of a per-token parsing callback: receives the token text, the auxiliary
// data of the corresponding feature, and the output row array to write into.
62 typedef void (*functionT)(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
// Applies the modifier to the parsing state.
// funcList: per-column parsing callbacks; auxVect: per-column auxiliary data.
// Both are passed by non-const reference so implementations can replace callbacks
// and adjust auxiliary data in place.
71 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const = 0;
// Virtual destructor with an empty body.
73 virtual ~ModifierIface() {}
// Parsing callback for a numeric token: converts `word` via readNumeric.
// NOTE(review): the declaration of `f` and the store into `arr` are on lines not
// shown in this listing.
75 static void contFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
78 readNumeric<>( word, f );
// Parsing callback for a categorical token: maps the token string to an integer
// category index using the feature's categorical dictionary and writes that index
// to arr[aux.idx].
82 static void catFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
84 std::string sWord(word);
86 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
87 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: reuse the index stored in the dictionary entry.
89 if( it != catDict->end() )
91 arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
// New category: its index is the current dictionary size; the entry is inserted
// as (index, 1) and categoryNumber becomes index + 1.
// NOTE(review): the `else` separating the two cases is on a line not shown here.
96 int index = (int)(catDict->size());
97 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
98 arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
99 aux.ntFeat->categoryNumber = index + 1;
// Parsing callback that does nothing: the token is ignored and `arr` is left untouched.
// ColumnFilter installs it for columns it excludes, and getModifierFunctionPtr
// returns it as its fallback.
103 static void nullFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
// Converts `text` to a number with daal_string_to_float and stores the result in `f`.
// NOTE(review): `T` is presumably a template parameter declared on a line not shown;
// the meaning of the second argument (0) is not visible here — confirm against
// daal_string_to_float.
107 static void readNumeric(
const char *text, T &f)
109 f = daal::services::daal_string_to_float(text, 0);
// Parsing callback that one-hot encodes a categorical token: the category index is
// resolved through the feature's categorical dictionary, then aux.nCats consecutive
// cells starting at arr[aux.idx] are filled with 1 at the category position and 0
// elsewhere.
112 static void binFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
114 std::string sWord(word);
116 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
117 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: take the stored index.
// NOTE(review): the declaration of `index` is on a line not shown here.
121 if( it != catDict->end() )
123 index = it->second.first;
// New category: index is the current dictionary size; insert (index, 1) and
// update categoryNumber. NOTE(review): the `else` is on a line not shown here.
128 index = catDict->size();
129 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((
int)index, 1) ) );
130 aux.ntFeat->categoryNumber = index + 1;
133 size_t nCats = aux.nCats;
// Only positions [0, nCats) are written, so a category index >= nCats yields all zeros.
135 for(
size_t i=0; i<nCats; i++)
137 arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
// Modifier that switches the parsing callback of one column to catFunc, so the
// column is parsed as a categorical feature.
146 class MakeCategorical :
public ModifierIface
// idx: index of the column in funcList to make categorical.
150 MakeCategorical(
size_t idx) : idx(idx) {}
152 virtual ~MakeCategorical() {}
// Replaces funcList[idx] with catFunc; auxVect is not touched in the visible lines.
// NOTE(review): nCols is presumably used to bounds-check idx on a line not shown.
154 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
156 size_t nCols = funcList.size();
160 funcList[idx] = catFunc;
// Modifier that switches one column to one-hot (binary categorical) encoding.
169 class OneHotEncoder :
public ModifierIface
// idx: index of the column to encode; nCats: number of output cells the column occupies.
174 OneHotEncoder(
size_t idx,
size_t nCats) : idx(idx), nCats(nCats) {}
176 virtual ~OneHotEncoder() {}
// Installs binFunc for the column and records nCats as both its category count and
// its output width, then recomputes the output offset of every column.
// NOTE(review): nCols is presumably also used to bounds-check idx on a line not shown.
178 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
180 size_t nCols = funcList.size();
184 funcList[idx] = binFunc;
185 auxVect[idx].nCats = nCats;
186 auxVect[idx].wide = nCats;
// Each column's output offset (idx) is the running sum of the widths of the columns
// before it. NOTE(review): nNTCols is declared on a line not shown, presumably as 0.
190 for(
size_t i=0; i<nCols; i++)
192 auxVect[i].idx = nNTCols;
193 nNTCols += auxVect[i].wide;
// Modifier that excludes columns by replacing their parsing callback with nullFunc.
// The selection mode is chosen through the chainable setters odd(), even(), none()
// and list().
// NOTE(review): the flag member declarations and the conditions that select each
// loop in apply() are on lines not shown in this listing.
202 class ColumnFilter :
public ModifierIface
// Column indices supplied through list().
208 services::Collection<size_t> validList;
// All selection flags start cleared.
210 ColumnFilter() : oddFlag(
false), evenFlag(
false), noneFlag(
false), listFlag(
false) {}
212 virtual ~ColumnFilter() {}
// Each setter raises its flag and returns *this so calls can be chained.
214 ColumnFilter& odd() { oddFlag=
true;
return *
this;}
215 ColumnFilter& even() {evenFlag=
true;
return *
this;}
216 ColumnFilter& none() {noneFlag=
true;
return *
this;}
// list() additionally stores a copy of the supplied index collection.
217 ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=
true;
return *
this;}
// Installs nullFunc for the filtered-out columns.
219 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
221 size_t nCols = funcList.size();
// Disables columns 0, 2, 4, ...
225 for(
size_t i=0; i<nCols; i+=2)
227 funcList[i] = nullFunc;
// Disables columns 1, 3, 5, ...
234 for(
size_t i=1; i<nCols; i+=2)
236 funcList[i] = nullFunc;
// Disables every column.
243 for(
size_t i=0; i<nCols; i++)
245 funcList[i] = nullFunc;
// List mode: a per-column flag array; columns whose flag is set are kept and all
// others are disabled.
// NOTE(review): the loop bodies that initialize the flags and mark the entries of
// validList are not shown — presumably flags start false and flags[el] is set.
252 services::Collection<bool> flags(nCols);
254 for(
size_t i=0; i<nCols; i++)
259 for(
size_t i=0; i<validList.size(); i++)
261 size_t el = validList[i];
268 for(
size_t i=0; i<nCols; i++)
270 if(flags[i])
continue;
271 funcList[i] = nullFunc;
// Recomputes each column's output offset (idx) as the running sum of widths.
// NOTE(review): nNTCols is declared on a line not shown, presumably as 0.
277 for(
size_t i=0; i<nCols; i++)
279 auxVect[i].idx = nNTCols;
280 nNTCols += auxVect[i].wide;
// Feature manager for CSV input: tokenizes raw text rows and converts them into
// dictionary entries and numeric-table rows.
298 class CSVFeatureManager :
public StringRowFeatureManagerIface
// Default constructor; the header is marked as not yet parsed.
// NOTE(review): other member initializers are on lines not shown in this listing.
304 CSVFeatureManager() :
307 _isHeaderParsed(false) { }
309 virtual ~CSVFeatureManager() { }
// Sets the character used to split a row into tokens; it is passed to
// CSVRowTokenizer by the parse* methods.
314 void setDelimiter(
char delimiter )
316 _delimiter = delimiter;
// Returns the number of columns of the output numeric table.
324 size_t getNumericTableNumberOfColumns()
const
// With a modifiers manager, it is the authority on the number of output features.
326 if (_modifiersManager)
328 return _modifiersManager->getNumberOfOutputFeatures();
// Otherwise: offset of the last data-source column plus its width.
// NOTE(review): no emptiness check is visible; with an empty auxVect, nDSCols - 1
// would wrap around (size_t) — confirm callers never reach this before the
// dictionary is filled.
331 const size_t nDSCols = auxVect.size();
332 return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
// Initializes the parsing state (callbacks, auxiliary data, expected token count)
// from an existing dictionary.
// Returns a status; DAAL_CHECK reports services::ErrorNullPtr when `dictionary` is null.
339 services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
341 DAAL_CHECK( dictionary, services::ErrorNullPtr );
345 fillAuxVectAndFuncList(*dictionary);
// One token is expected per dictionary feature.
346 _numberOfTokens = dictionary->getNumberOfFeatures();
348 return services::Status();
// Applies a ModifierIface-style modifier immediately to the current callback list
// and auxiliary data; the modifier object itself is not stored.
355 void addModifier(
const ModifierIface &modifier)
357 modifier.apply(funcList, auxVect);
// Registers a feature modifier for the given feature ids with the modifiers manager.
// featureIds: identifiers of the features the modifier applies to.
// modifier:   the modifier to register.
// status:     optional output status (may be NULL); it is handed to
//             tryAssignStatusAndThrow together with the local status.
// NOTE(review): the return statement is on a line not shown; the return type is a
// reference to CSVFeatureManager, presumably *this for chaining.
367 CSVFeatureManager &addModifier(
const features::FeatureIdCollectionIfacePtr &featureIds,
368 const modifiers::csv::FeatureModifierIfacePtr &modifier,
369 services::Status *status = NULL)
371 services::Status localStatus;
// The modifiers manager is created lazily on first use.
372 if (!_modifiersManager)
374 _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
377 services::internal::tryAssignStatusAndThrow(status, localStatus);
382 localStatus |= _modifiersManager->addModifier(featureIds, modifier);
385 services::internal::tryAssignStatusAndThrow(status, localStatus);
// Parses a raw row as the CSV header: every token is recorded as a feature name
// in _featuresInfo.
// rawRowData:  row buffer (asserted non-null); rawDataSize: its size.
397 void parseRowAsHeader(
char *rawRowData,
size_t rawDataSize)
399 DAAL_ASSERT( rawRowData );
401 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
402 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
404 _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
// Parses a raw data row to detect feature types and fills `dictionary` accordingly.
// rawRowData / dictionary are asserted non-null.
414 virtual void parseRowAsDictionary(
char *rawRowData,
size_t rawDataSize,
415 DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
417 DAAL_ASSERT( rawRowData );
418 DAAL_ASSERT( dictionary );
// Each token contributes one feature-type entry to _featuresInfo.
422 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
423 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
426 _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
// With a modifiers manager, it is prepared from the collected info and fills the
// dictionary itself.
429 if (_modifiersManager)
431 _modifiersManager->prepare(_featuresInfo);
432 _modifiersManager->fillDictionary(*dictionary);
// Legacy path without modifiers.
// NOTE(review): the `else` separating the two paths is on a line not shown here.
436 fillDictionaryWithoutModifiers(*dictionary);
// Parses one raw row and writes the converted values into row `ntRowIndex` of `nt`.
// NOTE(review): `dictionary` and `rawRowData` are asserted, but no check of `nt`
// is visible in this listing before it is dereferenced — confirm.
448 virtual void parseRowIn(
char *rawRowData,
size_t rawDataSize, DataSourceDictionary *dictionary,
449 NumericTable *nt,
size_t ntRowIndex) DAAL_C11_OVERRIDE
452 DAAL_ASSERT( dictionary );
453 DAAL_ASSERT( rawRowData );
// Acquire a one-row, write-only block and view it as a flat buffer of columns.
455 nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
456 services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
457 _currentRowBlock.getNumberOfColumns());
460 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
// Modifiers path: hand at most _numberOfTokens tokens to the manager, then let it
// produce the output row.
// NOTE(review): the token counter `i` is declared on a line not shown.
462 if (_modifiersManager)
464 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
466 _modifiersManager->setToken(i, tokenizer.getCurrentToken());
468 _modifiersManager->applyModifiers(rowBuffer);
// Legacy path: each token is converted by the callback registered for its column.
472 DAAL_DATA_TYPE *row = rowBuffer.data();
473 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
475 const services::StringView token = tokenizer.getCurrentToken();
476 funcList[i](token.c_str(), auxVect[i], row);
// Release the row block acquired above.
480 nt->releaseBlockOfRows(_currentRowBlock);
// Finalizes parsing: when a modifiers manager exists, it is finalized and the
// dictionary is refilled from it. Nothing visible happens otherwise.
// NOTE(review): `dictionary` is dereferenced without a visible null check.
487 void finalize(DataSourceDictionary *dictionary)
489 if (_modifiersManager)
491 _modifiersManager->finalize();
492 _modifiersManager->fillDictionary(*dictionary);
// Fills `dictionary` from the feature types detected in _featuresInfo, then builds
// the callback list and auxiliary data from the resulting dictionary.
497 void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
499 const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
500 dictionary.setNumberOfFeatures(nFeatures);
502 for (
size_t i = 0; i < nFeatures; i++)
504 features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
505 dictionary[i].ntFeature.featureType = fType;
// Storage type per feature type: continuous features use DAAL_DATA_TYPE,
// ordinal and categorical features use int.
// NOTE(review): the `switch` head and any default case are on lines not shown.
509 case features::DAAL_CONTINUOUS:
510 dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
513 case features::DAAL_ORDINAL:
514 case features::DAAL_CATEGORICAL:
515 dictionary[i].ntFeature.setType<
int>();
520 fillAuxVectAndFuncList(dictionary);
// Builds one FeatureAuxData entry and one parsing callback per dictionary feature.
// The stored pointers refer to elements of `dictionary`, which must therefore
// outlive their use by the callbacks.
523 void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
525 const size_t nFeatures = dictionary.getNumberOfFeatures();
// NOTE(review): resize() is followed by push_back() below. This is only correct if
// services::Collection::resize changes capacity rather than size — confirm against
// the Collection API. No clearing of previous entries is visible in this listing.
526 auxVect.resize(nFeatures);
527 funcList.resize(nFeatures);
529 for (
size_t i = 0; i < nFeatures; i++)
531 DataSourceFeature &feature = dictionary[i];
532 NumericTableFeature &ntFeature = feature.ntFeature;
534 auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
535 funcList.push_back(getModifierFunctionPtr(ntFeature));
// Selects the parsing callback for a feature from its feature type:
// continuous -> contFunc, ordinal/categorical -> catFunc, anything else -> nullFunc.
539 static functionT getModifierFunctionPtr(
const NumericTableFeature &ntFeature)
541 switch (ntFeature.featureType)
543 case features::DAAL_CONTINUOUS:
544 return ModifierIface::contFunc;
546 case features::DAAL_ORDINAL:
547 case features::DAAL_CATEGORICAL:
548 return ModifierIface::catFunc;
// Fallback for feature types not handled above.
550 return ModifierIface::nullFunc;
// Legacy (non-modifier) parsing state: one callback and one auxiliary record per column.
555 services::Collection<functionT> funcList;
556 services::Collection<FeatureAuxData> auxVect;
// NOTE(review): `_delimiter`, used by the parse* methods, is declared on a line not shown.
// Set to false by the constructor; not otherwise referenced in the visible lines.
559 bool _isHeaderParsed;
// Maximum number of tokens consumed per row by parseRowIn.
560 size_t _numberOfTokens;
// Reused block descriptor for the row currently being written by parseRowIn.
561 BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
// Feature names and types collected while parsing the header and the first data row.
563 internal::CSVFeaturesInfo _featuresInfo;
// Created lazily by the feature-ids addModifier overload; when set, it takes over
// dictionary filling and row conversion.
564 modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
569 using interface1::CSVFeatureManager;
daal::data_management::OneHotEncoder
Methods of the class to set a feature binary categorical.
Definition: csv_feature_manager.h:169
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:324
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:202
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:41
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:95
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Methods of the class to set a feature categorical.
Definition: csv_feature_manager.h:146
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:397
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:298
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:339
daal::services::ErrorNullPtr
Definition: error_indexes.h:139
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:285
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:314
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:367
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:68
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:574
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:355
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:266
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:304
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:448
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:414
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:487
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:161