C++ API Reference for Intel® Data Analytics Acceleration Library 2019

csv_feature_manager.h
1 /* file: csv_feature_manager.h */
2 /*******************************************************************************
3 * Copyright 2014-2018 Intel Corporation.
4 *
5 * This software and the related documents are Intel copyrighted materials, and
6 * your use of them is governed by the express license under which they were
7 * provided to you (License). Unless the License provides otherwise, you may not
8 * use, modify, copy, publish, distribute, disclose or transmit this software or
9 * the related documents without Intel's prior written permission.
10 *
11 * This software and the related documents are provided as is, with no express
12 * or implied warranties, other than those that are expressly stated in the
13 * License.
14 *******************************************************************************/
15 
16 /*
17 //++
18 // Implementation of the CSV feature manager class.
19 //--
20 */
21 
22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
24 
25 #include "data_management/data/numeric_table.h"
26 #include "data_management/features/shortcuts.h"
27 #include "data_management/data_source/data_source.h"
28 #include "data_management/data_source/internal/csv_feature_utils.h"
29 #include "data_management/data_source/modifiers/csv/shortcuts.h"
30 #include "data_management/data_source/modifiers/csv/internal/engine.h"
31 
32 namespace daal
33 {
34 namespace data_management
35 {
36 
41 struct FeatureAuxData
42 {
43  FeatureAuxData() :
44  idx(0), wide(1),
45  dsFeat(0), ntFeat(0), nCats(0) { }
46 
47  explicit FeatureAuxData(size_t index,
48  DataSourceFeature *dataSourceFeature,
49  NumericTableFeature *numericTableFeature) :
50  idx(index),
51  dsFeat(dataSourceFeature),
52  ntFeat(numericTableFeature),
53  wide(1), nCats(0) { }
54 
55  size_t idx;
56  size_t wide;
57  size_t nCats;
58  DataSourceFeature *dsFeat;
59  NumericTableFeature *ntFeat;
60 };
61 
62 typedef void (*functionT)(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
63 
68 class ModifierIface
69 {
70 public:
71  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const = 0;
72 
73  virtual ~ModifierIface() {}
74 
75  static void contFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
76  {
77  DAAL_DATA_TYPE f;
78  readNumeric<>( word, f );
79  arr[ aux.idx ] = f;
80  }
81 
82  static void catFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
83  {
84  std::string sWord(word);
85 
86  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
87  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
88 
89  if( it != catDict->end() )
90  {
91  arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
92  it->second.second++;
93  }
94  else
95  {
96  int index = (int)(catDict->size());
97  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
98  arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
99  aux.ntFeat->categoryNumber = index + 1;
100  }
101  }
102 
103  static void nullFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
104 
105 protected:
106  template<class T>
107  static void readNumeric(const char *text, T &f)
108  {
109  f = daal::services::daal_string_to_float(text, 0);
110  }
111 
112  static void binFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
113  {
114  std::string sWord(word);
115 
116  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
117  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
118 
119  size_t index = 0;
120 
121  if( it != catDict->end() )
122  {
123  index = it->second.first;
124  it->second.second++;
125  }
126  else
127  {
128  index = catDict->size();
129  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((int)index, 1) ) );
130  aux.ntFeat->categoryNumber = index + 1;
131  }
132 
133  size_t nCats = aux.nCats;
134 
135  for(size_t i=0; i<nCats; i++)
136  {
137  arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
138  }
139  }
140 };
141 
146 class MakeCategorical : public ModifierIface
147 {
148  size_t idx;
149 public:
150  MakeCategorical(size_t idx) : idx(idx) {}
151 
152  virtual ~MakeCategorical() {}
153 
154  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
155  {
156  size_t nCols = funcList.size();
157 
158  if(idx < nCols)
159  {
160  funcList[idx] = catFunc;
161  }
162  }
163 };
164 
169 class OneHotEncoder : public ModifierIface
170 {
171  size_t idx;
172  size_t nCats;
173 public:
174  OneHotEncoder(size_t idx, size_t nCats) : idx(idx), nCats(nCats) {}
175 
176  virtual ~OneHotEncoder() {}
177 
178  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
179  {
180  size_t nCols = funcList.size();
181 
182  if(idx < nCols)
183  {
184  funcList[idx] = binFunc;
185  auxVect[idx].nCats = nCats;
186  auxVect[idx].wide = nCats;
187  }
188 
189  size_t nNTCols = 0;
190  for(size_t i=0; i<nCols; i++)
191  {
192  auxVect[i].idx = nNTCols;
193  nNTCols += auxVect[i].wide;
194  }
195  }
196 };
197 
202 class ColumnFilter : public ModifierIface
203 {
204  bool oddFlag;
205  bool evenFlag;
206  bool noneFlag;
207  bool listFlag;
208  services::Collection<size_t> validList;
209 public:
210  ColumnFilter() : oddFlag(false), evenFlag(false), noneFlag(false), listFlag(false) {}
211 
212  virtual ~ColumnFilter() {}
213 
214  ColumnFilter& odd() { oddFlag=true; return *this;}
215  ColumnFilter& even() {evenFlag=true; return *this;}
216  ColumnFilter& none() {noneFlag=true; return *this;}
217  ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=true; return *this;}
218 
219  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
220  {
221  size_t nCols = funcList.size();
222 
223  if( oddFlag )
224  {
225  for(size_t i=0; i<nCols; i+=2)
226  {
227  funcList[i] = nullFunc;
228  auxVect[i].wide = 0;
229  }
230  }
231 
232  if( evenFlag )
233  {
234  for(size_t i=1; i<nCols; i+=2)
235  {
236  funcList[i] = nullFunc;
237  auxVect[i].wide = 0;
238  }
239  }
240 
241  if( noneFlag )
242  {
243  for(size_t i=0; i<nCols; i++)
244  {
245  funcList[i] = nullFunc;
246  auxVect[i].wide = 0;
247  }
248  }
249 
250  if( listFlag )
251  {
252  services::Collection<bool> flags(nCols);
253 
254  for(size_t i=0; i<nCols; i++)
255  {
256  flags[i] = false;
257  }
258 
259  for(size_t i=0; i<validList.size(); i++)
260  {
261  size_t el = validList[i];
262  if(el<nCols)
263  {
264  flags[el] = true;
265  }
266  }
267 
268  for(size_t i=0; i<nCols; i++)
269  {
270  if(flags[i]) continue;
271  funcList[i] = nullFunc;
272  auxVect[i].wide = 0;
273  }
274  }
275 
276  size_t nNTCols = 0;
277  for(size_t i=0; i<nCols; i++)
278  {
279  auxVect[i].idx = nNTCols;
280  nNTCols += auxVect[i].wide;
281  }
282  }
283 };
284 
285 namespace interface1
286 {
287 
298 class CSVFeatureManager : public StringRowFeatureManagerIface
299 {
300 public:
304  CSVFeatureManager() :
305  _delimiter(','),
306  _numberOfTokens(0),
307  _isHeaderParsed(false) { }
308 
309  virtual ~CSVFeatureManager() { }
310 
314  void setDelimiter( char delimiter )
315  {
316  _delimiter = delimiter;
317  }
318 
319 public:
324  size_t getNumericTableNumberOfColumns() const
325  {
326  if (_modifiersManager)
327  {
328  return _modifiersManager->getNumberOfOutputFeatures();
329  }
330 
331  const size_t nDSCols = auxVect.size();
332  return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
333  }
334 
339  services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
340  {
341  DAAL_CHECK( dictionary, services::ErrorNullPtr );
342 
343  auxVect.clear();
344  funcList.clear();
345  fillAuxVectAndFuncList(*dictionary);
346  _numberOfTokens = dictionary->getNumberOfFeatures();
347 
348  return services::Status();
349  }
350 
355  void addModifier(const ModifierIface &modifier)
356  {
357  modifier.apply(funcList, auxVect);
358  }
359 
367  CSVFeatureManager &addModifier(const features::FeatureIdCollectionIfacePtr &featureIds,
368  const modifiers::csv::FeatureModifierIfacePtr &modifier,
369  services::Status *status = NULL)
370  {
371  services::Status localStatus;
372  if (!_modifiersManager)
373  {
374  _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
375  if (!localStatus)
376  {
377  services::internal::tryAssignStatusAndThrow(status, localStatus);
378  return *this;
379  }
380  }
381 
382  localStatus |= _modifiersManager->addModifier(featureIds, modifier);
383  if (!localStatus)
384  {
385  services::internal::tryAssignStatusAndThrow(status, localStatus);
386  return *this;
387  }
388 
389  return *this;
390  }
391 
397  void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
398  {
399  DAAL_ASSERT( rawRowData );
400 
401  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
402  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
403  {
404  _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
405  }
406  }
407 
414  virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize,
415  DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
416  {
417  DAAL_ASSERT( rawRowData );
418  DAAL_ASSERT( dictionary );
419 
420  _numberOfTokens = 0;
421 
422  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
423  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
424  {
425  _numberOfTokens++;
426  _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
427  }
428 
429  if (_modifiersManager)
430  {
431  _modifiersManager->prepare(_featuresInfo);
432  _modifiersManager->fillDictionary(*dictionary);
433  }
434  else
435  {
436  fillDictionaryWithoutModifiers(*dictionary);
437  }
438  }
439 
448  virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary,
449  NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
450  {
451  DAAL_ASSERT( nt );
452  DAAL_ASSERT( dictionary );
453  DAAL_ASSERT( rawRowData );
454 
455  nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
456  services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
457  _currentRowBlock.getNumberOfColumns());
458 
459  size_t i = 0;
460  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
461 
462  if (_modifiersManager)
463  {
464  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
465  {
466  _modifiersManager->setToken(i, tokenizer.getCurrentToken());
467  }
468  _modifiersManager->applyModifiers(rowBuffer);
469  }
470  else
471  {
472  DAAL_DATA_TYPE *row = rowBuffer.data();
473  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
474  {
475  const services::StringView token = tokenizer.getCurrentToken();
476  funcList[i](token.c_str(), auxVect[i], row);
477  }
478  }
479 
480  nt->releaseBlockOfRows(_currentRowBlock);
481  }
482 
487  void finalize(DataSourceDictionary *dictionary)
488  {
489  if (_modifiersManager)
490  {
491  _modifiersManager->finalize();
492  _modifiersManager->fillDictionary(*dictionary);
493  }
494  }
495 
496 private:
497  void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
498  {
499  const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
500  dictionary.setNumberOfFeatures(nFeatures);
501 
502  for (size_t i = 0; i < nFeatures; i++)
503  {
504  features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
505  dictionary[i].ntFeature.featureType = fType;
506 
507  switch (fType)
508  {
509  case features::DAAL_CONTINUOUS:
510  dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
511  break;
512 
513  case features::DAAL_ORDINAL:
514  case features::DAAL_CATEGORICAL:
515  dictionary[i].ntFeature.setType<int>();
516  break;
517  }
518  }
519 
520  fillAuxVectAndFuncList(dictionary);
521  }
522 
523  void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
524  {
525  const size_t nFeatures = dictionary.getNumberOfFeatures();
526  auxVect.resize(nFeatures);
527  funcList.resize(nFeatures);
528 
529  for (size_t i = 0; i < nFeatures; i++)
530  {
531  DataSourceFeature &feature = dictionary[i];
532  NumericTableFeature &ntFeature = feature.ntFeature;
533 
534  auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
535  funcList.push_back(getModifierFunctionPtr(ntFeature));
536  }
537  }
538 
539  static functionT getModifierFunctionPtr(const NumericTableFeature &ntFeature)
540  {
541  switch (ntFeature.featureType)
542  {
543  case features::DAAL_CONTINUOUS:
544  return ModifierIface::contFunc;
545 
546  case features::DAAL_ORDINAL:
547  case features::DAAL_CATEGORICAL:
548  return ModifierIface::catFunc;
549  }
550  return ModifierIface::nullFunc;
551  }
552 
553 protected:
554  char _delimiter;
555  services::Collection<functionT> funcList;
556  services::Collection<FeatureAuxData> auxVect;
557 
558 private:
559  bool _isHeaderParsed;
560  size_t _numberOfTokens;
561  BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
562 
563  internal::CSVFeaturesInfo _featuresInfo;
564  modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
565 };
567 } // namespace interface1
568 
569 using interface1::CSVFeatureManager;
570 
571 } // namespace data_management
572 } // namespace daal
573 
574 #endif
daal::data_management::OneHotEncoder
Methods of the class to mark a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:169
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:324
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:202
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:41
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:95
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Methods of the class to mark a feature as categorical.
Definition: csv_feature_manager.h:146
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:397
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:298
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:339
daal::services::ErrorNullPtr
Definition: error_indexes.h:139
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:285
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:314
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:367
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:68
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:574
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:355
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:266
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:304
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:448
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:414
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:487
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:161

For more complete information about compiler optimizations, see our Optimization Notice.