48 #ifndef __CSV_FEATURE_MANAGER_H__ 49 #define __CSV_FEATURE_MANAGER_H__ 54 #include "services/daal_memory.h" 55 #include "data_management/data_source/data_source.h" 56 #include "data_management/data_source/data_source_dictionary.h" 57 #include "data_management/data/numeric_table.h" 58 #include "data_management/data/homogen_numeric_table.h" 62 namespace data_management
// Default constructor: idx and nCats start at 0, both pointers start null, and wide starts at 1.
// NOTE(review): the struct header and the declarations of idx, wide and nCats are not visible in this listing.
71 FeatureAuxData() : idx(0), wide(1), dsFeat(0), ntFeat(0), nCats(0) {};
// Data source feature; the categorical parsers reach the category dictionary through it.
75 DataSourceFeature *dsFeat;
// Numeric table feature; the categorical parsers update its categoryNumber through it.
76 NumericTableFeature *ntFeat;
79 typedef void (*functionT)(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
// Pure virtual hook implemented by every modifier. Both collections are passed by
// non-const reference, so an implementation can replace parsers and edit auxiliary data in place.
88 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const = 0;
// Virtual destructor with an empty body.
90 virtual ~ModifierIface() {}
// Parser callback for a continuous feature: passes word to readNumeric to obtain a number in f.
// NOTE(review): the declaration of f and the store into arr are not visible in this listing;
// presumably arr[aux.idx] receives f - confirm against the full header.
92 static void contFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
95 readNumeric<>( word, f );
// Parser callback for a categorical feature: maps the field text to an integer
// category index and writes that index to arr[aux.idx].
99 static void catFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
101 std::string sWord(word);
// Look the field text up in the categorical dictionary of this feature.
103 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
104 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: reuse the index stored as the first element of the mapped pair.
106 if( it != catDict->end() )
108 arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
// New category (presumably the else branch; the keyword is not visible in this listing):
// the new index is the current dictionary size, inserted together with the value 1.
113 int index = (int)(catDict->size());
114 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
115 arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
// Record the grown number of categories on the numeric table feature.
116 aux.ntFeat->categoryNumber = index + 1;
// Converts text to a number with daal_string_to_float and assigns the result to f.
// NOTE(review): T is used as a type here, but its template header is not visible in this listing.
122 static void readNumeric(
const char *text, T &f)
124 f = daal::services::daal_string_to_float(text, 0);
// Parser callback for a one-hot encoded feature: resolves the category index of the
// field text, then writes aux.nCats indicator values starting at arr[aux.idx].
// NOTE(review): the declaration of index is not visible in this listing.
127 static void binFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
129 std::string sWord(word);
// Look the field text up in the categorical dictionary of this feature.
131 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
132 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: take its stored index.
136 if( it != catDict->end() )
138 index = it->second.first;
// New category (presumably the else branch; the keyword is not visible in this listing):
// the next free index is the current dictionary size.
143 index = catDict->size();
144 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((
int)index, 1) ) );
145 aux.ntFeat->categoryNumber = index + 1;
148 size_t nCats = aux.nCats;
// Write 1 at the position equal to index and 0 at every other position;
// when index is not below nCats, every written value is 0.
150 for(
size_t i=0; i<nCats; i++)
152 arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
156 static void nullFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) {}
// Modifier that switches the parser of one feature to catFunc.
// NOTE(review): the declaration of the idx member is not visible in this listing.
164 class MakeCategorical :
public ModifierIface
// Stores the position of the feature whose parser is replaced.
168 MakeCategorical(
size_t idx) : idx(idx) {}
170 virtual ~MakeCategorical() {}
// Installs catFunc as the parser at position idx of funcList; auxVect is not used in the visible lines.
// NOTE(review): nCols is computed, but any bounds check of idx against it is not visible in this listing.
172 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const 174 size_t nCols = funcList.size();
178 funcList[idx] = catFunc;
// Modifier that switches the parser of one feature to binFunc and widens that
// feature to nCats output positions.
// NOTE(review): the declarations of the idx and nCats members are not visible in this listing.
187 class OneHotEncoder :
public ModifierIface
// Stores the position of the feature to encode and the number of categories to emit.
192 OneHotEncoder(
size_t idx,
size_t nCats) : idx(idx), nCats(nCats) {}
194 virtual ~OneHotEncoder() {}
// Installs binFunc for feature idx, records nCats as both its category count and its width,
// then renumbers the output offsets of all features.
// NOTE(review): any bounds check of idx against nCols is not visible in this listing.
196 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const 198 size_t nCols = funcList.size();
202 funcList[idx] = binFunc;
203 auxVect[idx].nCats = nCats;
204 auxVect[idx].wide = nCats;
// Each feature starts where the previous one ended: idx becomes the running sum of the widths.
// NOTE(review): the declaration of nNTCols is not visible; presumably it starts at 0.
208 for(
size_t i=0; i<nCols; i++)
210 auxVect[i].idx = nNTCols;
211 nNTCols += auxVect[i].wide;
// Modifier that replaces the parsers of selected features with nullFunc.
// The selection is configured through the chainable setters odd(), even(), none() and list().
// NOTE(review): the declarations of the four flag members are not visible in this listing.
220 class ColumnFilter :
public ModifierIface
// Positions supplied through list().
226 services::Collection<size_t> validList;
// All selection flags start cleared.
228 ColumnFilter() : oddFlag(
false), evenFlag(
false), noneFlag(
false), listFlag(
false) {}
230 virtual ~ColumnFilter() {}
// Each setter raises its flag and returns *this so that calls can be chained.
232 ColumnFilter& odd() { oddFlag=
true;
return *
this;}
233 ColumnFilter& even() {evenFlag=
true;
return *
this;}
234 ColumnFilter& none() {noneFlag=
true;
return *
this;}
// list() additionally stores a copy of the supplied positions in validList.
235 ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=
true;
return *
this;}
// Applies the configured selection to funcList and then renumbers the output offsets in auxVect.
// NOTE(review): the conditions that guard the loops below are not visible in this listing;
// presumably each loop runs only when the corresponding flag is set - confirm against the full header.
237 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const 239 size_t nCols = funcList.size();
// Positions 0, 2, 4, ... get nullFunc.
243 for(
size_t i=0; i<nCols; i+=2)
245 funcList[i] = nullFunc;
// Positions 1, 3, 5, ... get nullFunc.
252 for(
size_t i=1; i<nCols; i+=2)
254 funcList[i] = nullFunc;
// Every position gets nullFunc.
261 for(
size_t i=0; i<nCols; i++)
263 funcList[i] = nullFunc;
// One flag per position, driven by validList; positions whose flag is set keep their parser.
// NOTE(review): the bodies of the next two loops are not visible in this listing.
270 services::Collection<bool> flags(nCols);
272 for(
size_t i=0; i<nCols; i++)
277 for(
size_t i=0; i<validList.size(); i++)
279 size_t el = validList[i];
// Positions whose flag is not set get nullFunc.
286 for(
size_t i=0; i<nCols; i++)
288 if(flags[i])
continue;
289 funcList[i] = nullFunc;
// Each feature starts where the previous one ended: idx becomes the running sum of the widths.
// NOTE(review): the declaration of nNTCols is not visible; presumably it starts at 0.
295 for(
size_t i=0; i<nCols; i++)
297 auxVect[i].idx = nNTCols;
298 nNTCols += auxVect[i].wide;
// Feature manager for delimiter-separated text rows; implements StringRowFeatureManagerIface.
315 class CSVFeatureManager :
public StringRowFeatureManagerIface
// Parser callback for each data source feature.
320 services::Collection<functionT> funcList;
// Auxiliary data for each data source feature, indexed like funcList.
321 services::Collection<FeatureAuxData> auxVect;
// The delimiter defaults to a comma.
327 CSVFeatureManager() : _delimiter(
',') {}
329 virtual ~CSVFeatureManager() {}
334 void setDelimiter(
char delimiter )
336 _delimiter = delimiter;
340 size_t getNumericTableNumberOfColumns()
342 size_t nDSCols = auxVect.size();
343 return auxVect[nDSCols-1].idx + auxVect[nDSCols-1].wide;
// Splits one raw text row into fields, derives a feature description from each field,
// stores the descriptions in dict and finally initializes the per-feature parsers.
// NOTE(review): several lines are not visible in this listing, including the declarations
// of pos, len, f, nCols and idx and the enclosing loop.
346 virtual void parseRowAsDictionary(
char *rawRowData,
size_t rawDataSize,
347 DataSourceDictionary *dict ) DAAL_C11_OVERRIDE
// Scratch buffer for one field, one character longer than the row.
// NOTE(review): the matching delete[] is not visible in this listing - confirm the buffer is released.
349 char *word =
new char[rawDataSize + 1];
351 std::list<DataSourceFeature> featureList;
353 bool isEmpty =
false;
// Stop at the end of the row text.
358 if (rawRowData[pos] ==
'\0') {
break; }
// Copy characters into word until the delimiter, the terminator or the size limit.
361 while (len < rawDataSize && rawRowData[pos] != _delimiter && rawRowData[pos] !=
'\0')
363 word[len] = rawRowData[pos];
// Step over the delimiter that ended the field.
370 if (rawRowData[pos] == _delimiter) pos++;
// A field that starts with a terminator, carriage return or line feed counts as empty and ends parsing.
373 isEmpty = (word[0] == 0 || word[0] ==
'\r' || word[0] ==
'\n');
375 if (isEmpty) {
break; }
// Check whether the field can be read as a number.
377 bool isNumeric = readNumericDetailed<>( word, f );
379 DataSourceFeature feat;
// NOTE(review): the branch on isNumeric is not visible; presumably a numeric field only gets the
// data type set, while a non-numeric field is also marked categorical.
383 feat.setType<DAAL_DATA_TYPE>();
388 feat.ntFeature.featureType = data_feature_utils::DAAL_CATEGORICAL;
391 featureList.push_back(feat);
// Publish the collected features to the dictionary.
398 dict->setNumberOfFeatures(nCols);
401 for( std::list<DataSourceFeature>::iterator it = featureList.begin() ; it != featureList.end() ; it++ )
403 dict->setFeature( *it, idx );
405 if( idx == nCols ) {
break; }
// Build funcList and auxVect from the dictionary that was just filled.
408 initializeFeatureDetails(dict);
411 void setFeatureDetailsFromDictionary(DataSourceDictionary* dict)
413 initializeFeatureDetails(dict);
416 void addModifier(
const ModifierIface& modifier )
418 modifier.apply( funcList, auxVect );
// Splits one raw text row into fields and runs the parser of each feature on its field,
// writing the results into row ntRowIndex of the numeric table nt.
// NOTE(review): several lines are not visible in this listing, including the declaration
// of pos and the body of the delimiter branch.
429 virtual void parseRowIn (
char *rawRowData,
size_t rawDataSize, DataSourceDictionary *dict,
430 NumericTable *nt,
size_t ntRowIndex ) DAAL_C11_OVERRIDE
432 size_t dFeatures = auxVect.size();
// One field pointer per data source feature.
// NOTE(review): the matching delete[] is not visible in this listing - confirm the array is released.
434 char const **words =
new char const *[dFeatures];
// Acquire one row of the table for writing.
441 nt->getBlockOfRows( ntRowIndex, 1, writeOnly, block );
442 DAAL_DATA_TYPE *row = block.getBlockPtr();
// The first field starts at the beginning of the row text.
445 words[ pos ] = rawRowData;
// Each delimiter starts a new field right after it.
// NOTE(review): presumably the delimiter is overwritten with a terminator and pos is advanced
// with a bound check against dFeatures - those lines are not visible; confirm.
447 for(
size_t i=0; i<rawDataSize; i++ )
449 if( rawRowData[i] == _delimiter )
455 words[pos] = rawRowData + i + 1;
// Terminate the last field; this writes at index rawDataSize, so the buffer needs rawDataSize + 1 bytes.
459 rawRowData[rawDataSize] = 0;
// Features with no field in this row are parsed from the text "0".
461 const char* zeroStr =
"0";
462 for( pos++; pos<dFeatures; pos++ )
464 words[pos] = zeroStr;
// Run the parser of every feature on its field.
467 for(
size_t i = 0; i < dFeatures; i++ )
469 funcList[i]( words[i], auxVect[i], row );
472 nt->releaseBlockOfRows( block );
// Block descriptor member; presumably the block that parseRowIn passes to
// getBlockOfRows/releaseBlockOfRows - confirm no local of the same name shadows it.
478 BlockDescriptor<DAAL_DATA_TYPE> block;
// Builds an input string stream over text and returns true when the stream is not in a failed state.
// NOTE(review): the template header and the statement that extracts into f are not visible in this
// listing; presumably f is read from iss before the state is checked - confirm.
481 bool readNumericDetailed(
char *text, T &f)
483 std::istringstream iss(text);
485 return !(iss.fail());
// Fills funcList and auxVect with one entry per dictionary feature: contFunc for
// continuous features, catFunc otherwise, plus pointers back into the dictionary.
// NOTE(review): entries are appended with push_back and no clearing is visible, so calling this
// more than once on the same manager looks like it would grow both collections - confirm.
488 void initializeFeatureDetails(DataSourceDictionary* dict)
490 const size_t nCols = dict->getNumberOfFeatures();
// NOTE(review): push_back follows and index i is used below, so resize presumably changes
// the capacity rather than the element count - confirm against the Collection documentation.
491 funcList.resize(nCols);
492 auxVect.resize(nCols);
494 for(
size_t i=0; i<nCols; i++)
// Choose the parser from the feature type.
// NOTE(review): the else keyword is not visible in this listing.
496 if( (*dict)[i].ntFeature.featureType == data_feature_utils::DAAL_CONTINUOUS )
498 funcList.push_back( ModifierIface::contFunc );
502 funcList.push_back( ModifierIface::catFunc );
504 auxVect.push_back( FeatureAuxData() );
// Point the auxiliary data at the dictionary feature and at its nested numeric table feature.
506 auxVect[i].dsFeat = &(*dict)[i];
507 auxVect[i].ntFeat = &auxVect[i].dsFeat->ntFeature;
513 using interface1::CSVFeatureManager;
daal::data_management::OneHotEncoder
Methods of the class to set a feature as binary categorical.
Definition: csv_feature_manager.h:187
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:346
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:308
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from the output numeric table.
Definition: csv_feature_manager.h:220
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:69
daal
Definition: algorithm_base_common.h:57
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:429
daal::data_management::interface1::DataSourceFeature::setType
void setType()
Definition: data_source_dictionary.h:154
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:70
daal::data_management::MakeCategorical
Methods of the class to set a feature as categorical.
Definition: csv_feature_manager.h:164
daal::data_management::interface1::DataSourceFeature
Data structure that describes the Data Source feature.
Definition: data_source_dictionary.h:75
daal::data_management::interface1::CategoricalFeatureDictionary
Definition: data_source_dictionary.h:66
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:315
daal::services::interface1::Collection::push_back
Collection & push_back(const T &x)
Definition: collection.h:197
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:334
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:85
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:600
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:327
daal::services::interface1::Collection::size
size_t size() const
Definition: collection.h:144
daal::data_management::interface1::DataSourceFeature::getCategoricalDictionary
CategoricalFeatureDictionary * getCategoricalDictionary()
Definition: data_source_dictionary.h:124
daal::data_management::interface1::BlockDescriptor< DAAL_DATA_TYPE >
daal::data_management::interface1::NumericTableFeature
Data structure that describes the Numeric Table feature.
Definition: data_dictionary.h:74
daal::services::interface1::Collection::resize
bool resize(size_t newCapacity)
Definition: collection.h:222
daal::services::interface1::Collection
Class that implements functionality of the Collection container.
Definition: collection.h:69
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:184