22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
28 #include "services/daal_memory.h"
29 #include "data_management/data_source/data_source.h"
30 #include "data_management/data_source/data_source_dictionary.h"
31 #include "data_management/data/numeric_table.h"
32 #include "data_management/data/homogen_numeric_table.h"
36 namespace data_management
// Auxiliary per-column data used while extracting features.
// The constructor zero-initializes every member except `wide`, which starts at 1.
45 FeatureAuxData() : idx(0), wide(1), dsFeat(0), ntFeat(0), nCats(0) {};
// Data-source feature of the column; the categorical parsers reach the category dictionary through it.
49 DataSourceFeature *dsFeat;
// Numeric-table feature of the column; the categorical parsers update its categoryNumber.
50 NumericTableFeature *ntFeat;
// Signature of a per-column parsing function: `word` is the text of one field,
// `aux` is that column's auxiliary data, and `arr` is the output row buffer.
53 typedef void (*functionT)(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
// Applies the modifier to the list of per-column parsing functions (funcList)
// and to the per-column auxiliary data (auxVect); both are modified in place.
62 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const = 0;
// Virtual destructor for the polymorphic modifier interface.
64 virtual ~ModifierIface() {}
// Parsing function for continuous columns: converts `word` to a number with readNumeric().
// NOTE(review): the declaration of `f` and the store into `arr` are not visible in this listing;
// presumably the value is written to arr[aux.idx] - confirm against the full header.
66 static void contFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
69 readNumeric<>( word, f );
// Parsing function for categorical columns: maps the text of a field to an integer
// category index and writes that index to arr[aux.idx].
73 static void catFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
75 std::string sWord(word);
// The category dictionary belongs to the column's data-source feature.
77 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
78 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: reuse the index stored as the first element of the mapped pair.
80 if( it != catDict->end() )
82 arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
// New category (presumably the else-branch; the branch keyword is not visible in this listing):
// the next free index is the current dictionary size, inserted together with the value 1.
87 int index = (int)(catDict->size());
88 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
89 arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
// Keep the numeric-table feature's category count in step with the dictionary.
90 aux.ntFeat->categoryNumber = index + 1;
// Converts `text` with daal::services::daal_string_to_float and stores the result in `f`.
// The second argument is passed as 0 - presumably an unused end pointer; confirm against
// the declaration of daal_string_to_float.
96 static void readNumeric(
const char *text, T &f)
98 f = daal::services::daal_string_to_float(text, 0);
// Parsing function for one-hot encoded columns: resolves the field text to a category
// index and writes aux.nCats consecutive values starting at arr[aux.idx], with 1 at the
// position equal to the category index and 0 elsewhere.
101 static void binFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
103 std::string sWord(word);
105 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
106 CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
// Known category: take the stored index.
110 if( it != catDict->end() )
112 index = it->second.first;
// New category (presumably the else-branch; not visible in this listing): register it
// under the next free index, which is the current dictionary size.
117 index = catDict->size();
118 catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((
int)index, 1) ) );
119 aux.ntFeat->categoryNumber = index + 1;
122 size_t nCats = aux.nCats;
// A category index at or beyond nCats matches no position, so the whole group is zero.
124 for(
size_t i=0; i<nCats; i++)
126 arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
// Parsing function that does nothing; ColumnFilter installs it for columns it disables.
130 static void nullFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) {}
// Modifier that makes one data-source column be parsed as categorical.
138 class MakeCategorical :
public ModifierIface
// idx: index of the column whose parsing function is replaced.
142 MakeCategorical(
size_t idx) : idx(idx) {}
144 virtual ~MakeCategorical() {}
// Replaces the parsing function of column `idx` with catFunc.
// NOTE(review): nCols is read here, but the statements between it and the assignment are
// not visible in this listing; presumably they bounds-check idx against nCols - confirm.
146 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
148 size_t nCols = funcList.size();
152 funcList[idx] = catFunc;
// Modifier that makes one data-source column be parsed as a one-hot encoded group
// of nCats numeric-table columns.
161 class OneHotEncoder :
public ModifierIface
// idx: index of the data-source column; nCats: number of output columns in its group.
166 OneHotEncoder(
size_t idx,
size_t nCats) : idx(idx), nCats(nCats) {}
168 virtual ~OneHotEncoder() {}
// Installs binFunc for column `idx`, records its category count and width, then
// reassigns the output offset of every column.
170 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
172 size_t nCols = funcList.size();
// NOTE(review): the statements between nCols and the assignment are not visible in this
// listing; presumably they bounds-check idx - confirm.
176 funcList[idx] = binFunc;
177 auxVect[idx].nCats = nCats;
178 auxVect[idx].wide = nCats;
// Running sum of widths: each column starts where the previous one ends.
// (The declaration of nNTCols is not visible in this listing.)
182 for(
size_t i=0; i<nCols; i++)
184 auxVect[i].idx = nNTCols;
185 nNTCols += auxVect[i].wide;
// Modifier that disables data-source columns by replacing their parsing function with nullFunc.
// The selection is configured through the chainable setters odd(), even(), none() and list().
194 class ColumnFilter :
public ModifierIface
// Column indices supplied through list().
200 services::Collection<size_t> validList;
// All selection flags start cleared.
202 ColumnFilter() : oddFlag(
false), evenFlag(
false), noneFlag(
false), listFlag(
false) {}
204 virtual ~ColumnFilter() {}
// Each setter raises its flag and returns *this so that calls can be chained.
206 ColumnFilter& odd() { oddFlag=
true;
return *
this;}
207 ColumnFilter& even() {evenFlag=
true;
return *
this;}
208 ColumnFilter& none() {noneFlag=
true;
return *
this;}
// Stores a copy of `valid` in validList and raises listFlag.
209 ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=
true;
return *
this;}
// Applies the configured filter to funcList and then reassigns the output offsets in auxVect.
// NOTE(review): the flag tests guarding the loops below are not visible in this listing;
// which flag selects which loop must be confirmed against the full header.
211 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
213 size_t nCols = funcList.size();
// Disables the columns with zero-based indices 0, 2, 4, ...
217 for(
size_t i=0; i<nCols; i+=2)
219 funcList[i] = nullFunc;
// Disables the columns with zero-based indices 1, 3, 5, ...
226 for(
size_t i=1; i<nCols; i+=2)
228 funcList[i] = nullFunc;
// Disables every column.
235 for(
size_t i=0; i<nCols; i++)
237 funcList[i] = nullFunc;
// List mode: one flag per column; columns whose flag is set are kept, all others are disabled.
// (The statements that initialize and set the flags are not visible in this listing.)
244 services::Collection<bool> flags(nCols);
246 for(
size_t i=0; i<nCols; i++)
251 for(
size_t i=0; i<validList.size(); i++)
253 size_t el = validList[i];
260 for(
size_t i=0; i<nCols; i++)
262 if(flags[i])
continue;
263 funcList[i] = nullFunc;
// Running sum of widths: each column starts where the previous one ends.
// (The declaration of nNTCols and any width changes for disabled columns are not visible here.)
269 for(
size_t i=0; i<nCols; i++)
271 auxVect[i].idx = nNTCols;
272 nNTCols += auxVect[i].wide;
// Feature manager for CSV rows: builds a data-source dictionary from a row of text
// and parses rows of text into rows of a numeric table.
289 class CSVFeatureManager :
public StringRowFeatureManagerIface
// Per-column parsing functions, one entry per data-source column.
294 services::Collection<functionT> funcList;
// Per-column auxiliary data, parallel to funcList.
295 services::Collection<FeatureAuxData> auxVect;
// The default field delimiter is a comma.
301 CSVFeatureManager() : _delimiter(
',') {}
303 virtual ~CSVFeatureManager() {}
// Sets the character that separates fields within a row.
308 void setDelimiter(
char delimiter )
310 _delimiter = delimiter;
314 size_t getNumericTableNumberOfColumns()
316 size_t nDSCols = auxVect.size();
317 return auxVect[nDSCols-1].idx + auxVect[nDSCols-1].wide;
// Builds the data-source dictionary from one row of text: splits the row on the delimiter,
// classifies every field as numeric or categorical, stores one feature per field in `dict`,
// and finally initializes the per-column parsing details.
//   rawRowData  - zero-terminated text of the row
//   rawDataSize - size of the row text; also bounds the length of a single field
//   dict        - dictionary that receives the detected features
320 virtual void parseRowAsDictionary(
char *rawRowData,
size_t rawDataSize,
321 DataSourceDictionary *dict ) DAAL_C11_OVERRIDE
// Scratch buffer large enough for the longest possible field plus a terminator.
// NOTE(review): the matching delete[] is not visible in this listing - confirm it exists.
323 char *word =
new char[rawDataSize + 1];
325 std::list<DataSourceFeature> featureList;
327 bool isEmpty =
false;
// End of the row text: stop scanning.
332 if (rawRowData[pos] ==
'\0') {
break; }
// Copy characters into `word` up to the next delimiter or the end of the text.
335 while (len < rawDataSize && rawRowData[pos] != _delimiter && rawRowData[pos] !=
'\0')
337 word[len] = rawRowData[pos];
// Step over the delimiter that ended the field.
344 if (rawRowData[pos] == _delimiter) pos++;
// A field that is empty or starts with a line-break character ends the scan.
347 isEmpty = (word[0] == 0 || word[0] ==
'\r' || word[0] ==
'\n');
349 if (isEmpty) {
break; }
351 bool isNumeric = readNumericDetailed<>( word, f );
353 DataSourceFeature feat;
// Numeric field: the feature takes the DAAL_DATA_TYPE type.
// (The branch keywords around the next two statements are not visible in this listing.)
357 feat.setType<DAAL_DATA_TYPE>();
// Non-numeric field: the feature is marked categorical.
362 feat.ntFeature.featureType = data_feature_utils::DAAL_CATEGORICAL;
365 featureList.push_back(feat);
372 dict->setNumberOfFeatures(nCols);
// Copy the collected features into the dictionary, at most nCols of them.
375 for( std::list<DataSourceFeature>::iterator it = featureList.begin() ; it != featureList.end() ; it++ )
377 dict->setFeature( *it, idx );
379 if( idx == nCols ) {
break; }
382 initializeFeatureDetails(dict);
// Initializes the per-column parsing functions and auxiliary data from an existing dictionary.
385 void setFeatureDetailsFromDictionary(DataSourceDictionary* dict)
387 initializeFeatureDetails(dict);
// Applies `modifier` to this manager's parsing functions and auxiliary data right away;
// the modifier object itself is not stored.
390 void addModifier(
const ModifierIface& modifier )
392 modifier.apply( funcList, auxVect );
// Parses one row of text into row `ntRowIndex` of the numeric table `nt`.
// The row is split in place on the delimiter, and each field is handed to the parsing
// function of its column together with that column's auxiliary data.
//   rawRowData  - text of the row; it is modified by this function
//   rawDataSize - number of characters to scan
//   dict        - not referenced by the statements visible in this listing
//   nt          - destination numeric table
//   ntRowIndex  - index of the destination row
403 virtual void parseRowIn (
char *rawRowData,
size_t rawDataSize, DataSourceDictionary *dict,
404 NumericTable *nt,
size_t ntRowIndex ) DAAL_C11_OVERRIDE
406 size_t dFeatures = auxVect.size();
// One start-of-field pointer per data-source column.
// NOTE(review): the matching delete[] is not visible in this listing - confirm it exists.
408 char const **words =
new char const *[dFeatures];
// Acquire the destination row for writing only.
415 nt->getBlockOfRows( ntRowIndex, 1, writeOnly, block );
416 DAAL_DATA_TYPE *row = block.getBlockPtr();
// The first field starts at the beginning of the row.
419 words[ pos ] = rawRowData;
// Each delimiter starts a new field at the following character.
// (The statements that terminate the previous field and advance pos are not visible here.)
421 for(
size_t i=0; i<rawDataSize; i++ )
423 if( rawRowData[i] == _delimiter )
429 words[pos] = rawRowData + i + 1;
// NOTE(review): writes at index rawDataSize, so the caller's buffer must hold at least
// rawDataSize + 1 characters - confirm against callers.
433 rawRowData[rawDataSize] = 0;
// Columns missing from the row are parsed from the text "0".
435 const char* zeroStr =
"0";
436 for( pos++; pos<dFeatures; pos++ )
438 words[pos] = zeroStr;
// Let every column's parsing function write its value(s) into the row.
441 for(
size_t i = 0; i < dFeatures; i++ )
443 funcList[i]( words[i], auxVect[i], row );
446 nt->releaseBlockOfRows( block );
// Block descriptor used by parseRowIn to acquire and release the destination row.
452 BlockDescriptor<DAAL_DATA_TYPE> block;
// Reports whether `text` can be read as a number, using a string stream.
// Returns true when the stream is not in a failed state after the read.
// NOTE(review): the extraction statement itself is not visible in this listing. If it is a
// plain `iss >> f`, text with a numeric prefix (for example "12abc") also counts as numeric
// because only fail() is checked - confirm whether that is intended.
455 bool readNumericDetailed(
char *text, T &f)
457 std::istringstream iss(text);
459 return !(iss.fail());
// Fills funcList and auxVect with one entry per dictionary feature: continuous features get
// contFunc, all other features get catFunc, and each auxiliary entry is pointed at the
// corresponding dictionary feature.
462 void initializeFeatureDetails(DataSourceDictionary* dict)
464 const size_t nCols = dict->getNumberOfFeatures();
// NOTE(review): the push_back calls below rely on resize() adjusting capacity rather than
// size; otherwise auxVect[i] would not address the element just appended. Confirm against
// services::Collection. No clearing of previous entries is visible in this listing either,
// so a second call may append to existing entries - confirm.
465 funcList.resize(nCols);
466 auxVect.resize(nCols);
468 for(
size_t i=0; i<nCols; i++)
470 if( (*dict)[i].ntFeature.featureType == data_feature_utils::DAAL_CONTINUOUS )
472 funcList.push_back( ModifierIface::contFunc );
476 funcList.push_back( ModifierIface::catFunc );
478 auxVect.push_back( FeatureAuxData() );
// The pointers refer to features owned by `dict`, so `dict` must outlive their use.
480 auxVect[i].dsFeat = &(*dict)[i];
481 auxVect[i].ntFeat = &auxVect[i].dsFeat->ntFeature;
487 using interface1::CSVFeatureManager;
daal::data_management::OneHotEncoder
Modifier class that marks a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:161
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:320
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Modifier class that filters data source features out of the output numeric table.
Definition: csv_feature_manager.h:194
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:43
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:403
daal::data_management::interface1::DataSourceFeature::setType
void setType()
Definition: data_source_dictionary.h:128
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Modifier class that marks a feature as categorical.
Definition: csv_feature_manager.h:138
daal::data_management::interface1::DataSourceFeature
Data structure that describes the Data Source feature.
Definition: data_source_dictionary.h:49
daal::data_management::interface1::CSVFeatureManager
Class that provides methods to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:289
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:308
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:59
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:574
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:301
daal::data_management::interface1::BlockDescriptor< DAAL_DATA_TYPE >
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionary.
Definition: data_dictionary.h:158