C++ API Reference for Intel® Data Analytics Acceleration Library 2019 Update 4

csv_feature_manager.h
1 /* file: csv_feature_manager.h */
2 /*******************************************************************************
3 * Copyright 2014-2019 Intel Corporation.
4 *
5 * This software and the related documents are Intel copyrighted materials, and
6 * your use of them is governed by the express license under which they were
7 * provided to you (License). Unless the License provides otherwise, you may not
8 * use, modify, copy, publish, distribute, disclose or transmit this software or
9 * the related documents without Intel's prior written permission.
10 *
11 * This software and the related documents are provided as is, with no express
12 * or implied warranties, other than those that are expressly stated in the
13 * License.
14 *******************************************************************************/
15 
16 /*
17 //++
18 // Implementation of the CSV feature manager class.
19 //--
20 */
21 
22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
24 
25 #include "data_management/data/numeric_table.h"
26 #include "data_management/features/shortcuts.h"
27 #include "data_management/data_source/data_source.h"
28 #include "data_management/data_source/internal/csv_feature_utils.h"
29 #include "data_management/data_source/modifiers/csv/shortcuts.h"
30 #include "data_management/data_source/modifiers/csv/internal/engine.h"
31 
32 namespace daal
33 {
34 namespace data_management
35 {
36 
41 struct FeatureAuxData
42 {
43  FeatureAuxData() :
44  idx(0), wide(1),
45  dsFeat(0), ntFeat(0), nCats(0) { }
46 
47  explicit FeatureAuxData(size_t index,
48  DataSourceFeature *dataSourceFeature,
49  NumericTableFeature *numericTableFeature) :
50  idx(index),
51  dsFeat(dataSourceFeature),
52  ntFeat(numericTableFeature),
53  wide(1), nCats(0), buffer() { }
54 
55  size_t idx;
56  size_t wide;
57  size_t nCats;
58  DataSourceFeature *dsFeat;
59  NumericTableFeature *ntFeat;
60  std::string buffer;
61 };
62 
/* Pointer to a per-column parsing callback: it receives the token text (word), the column's auxiliary data (aux) and the output row buffer (arr). */
63 typedef void (*functionT)(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
64 
69 class ModifierIface
70 {
71 public:
72  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const = 0;
73 
74  virtual ~ModifierIface() {}
75 
76  static void contFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
77  {
78  DAAL_DATA_TYPE f;
79  readNumeric<>( word, f );
80  arr[ aux.idx ] = f;
81  }
82 
83  static void catFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
84  {
85  aux.buffer.assign(word);
86 
87  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
88  CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
89 
90  if( it != catDict->end() )
91  {
92  arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
93  it->second.second++;
94  }
95  else
96  {
97  int index = (int)(catDict->size());
98  catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>(index, 1) ) );
99  arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
100  aux.ntFeat->categoryNumber = index + 1;
101  }
102  }
103 
104  static void nullFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
105 
106 protected:
107  template<class T>
108  static void readNumeric(const char *text, T &f)
109  {
110  f = daal::services::daal_string_to_float(text, 0);
111  }
112 
113  static void binFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
114  {
115  aux.buffer.assign(word);
116 
117  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
118  CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
119 
120  size_t index = 0;
121 
122  if( it != catDict->end() )
123  {
124  index = it->second.first;
125  it->second.second++;
126  }
127  else
128  {
129  index = catDict->size();
130  catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>((int)index, 1) ) );
131  aux.ntFeat->categoryNumber = index + 1;
132  }
133 
134  size_t nCats = aux.nCats;
135 
136  for(size_t i=0; i<nCats; i++)
137  {
138  arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
139  }
140  }
141 };
142 
147 class MakeCategorical : public ModifierIface
148 {
149  size_t idx;
150 public:
151  MakeCategorical(size_t idx) : idx(idx) {}
152 
153  virtual ~MakeCategorical() {}
154 
155  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
156  {
157  size_t nCols = funcList.size();
158 
159  if(idx < nCols)
160  {
161  funcList[idx] = catFunc;
162  auxVect[idx].buffer.resize(1024);
163  }
164  }
165 };
166 
171 class OneHotEncoder : public ModifierIface
172 {
173  size_t idx;
174  size_t nCats;
175 public:
176  OneHotEncoder(size_t idx, size_t nCats) : idx(idx), nCats(nCats) {}
177 
178  virtual ~OneHotEncoder() {}
179 
180  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
181  {
182  size_t nCols = funcList.size();
183 
184  if(idx < nCols)
185  {
186  funcList[idx] = binFunc;
187  auxVect[idx].buffer.resize(1024);
188  auxVect[idx].nCats = nCats;
189  auxVect[idx].wide = nCats;
190  }
191 
192  size_t nNTCols = 0;
193  for(size_t i=0; i<nCols; i++)
194  {
195  auxVect[i].idx = nNTCols;
196  nNTCols += auxVect[i].wide;
197  }
198  }
199 };
200 
205 class ColumnFilter : public ModifierIface
206 {
207  bool oddFlag;
208  bool evenFlag;
209  bool noneFlag;
210  bool listFlag;
211  services::Collection<size_t> validList;
212 public:
213  ColumnFilter() : oddFlag(false), evenFlag(false), noneFlag(false), listFlag(false) {}
214 
215  virtual ~ColumnFilter() {}
216 
217  ColumnFilter& odd() { oddFlag=true; return *this;}
218  ColumnFilter& even() {evenFlag=true; return *this;}
219  ColumnFilter& none() {noneFlag=true; return *this;}
220  ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=true; return *this;}
221 
222  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
223  {
224  size_t nCols = funcList.size();
225 
226  if( oddFlag )
227  {
228  for(size_t i=0; i<nCols; i+=2)
229  {
230  funcList[i] = nullFunc;
231  auxVect[i].wide = 0;
232  }
233  }
234 
235  if( evenFlag )
236  {
237  for(size_t i=1; i<nCols; i+=2)
238  {
239  funcList[i] = nullFunc;
240  auxVect[i].wide = 0;
241  }
242  }
243 
244  if( noneFlag )
245  {
246  for(size_t i=0; i<nCols; i++)
247  {
248  funcList[i] = nullFunc;
249  auxVect[i].wide = 0;
250  }
251  }
252 
253  if( listFlag )
254  {
255  services::Collection<bool> flags(nCols);
256 
257  for(size_t i=0; i<nCols; i++)
258  {
259  flags[i] = false;
260  }
261 
262  for(size_t i=0; i<validList.size(); i++)
263  {
264  size_t el = validList[i];
265  if(el<nCols)
266  {
267  flags[el] = true;
268  }
269  }
270 
271  for(size_t i=0; i<nCols; i++)
272  {
273  if(flags[i]) continue;
274  funcList[i] = nullFunc;
275  auxVect[i].wide = 0;
276  }
277  }
278 
279  size_t nNTCols = 0;
280  for(size_t i=0; i<nCols; i++)
281  {
282  auxVect[i].idx = nNTCols;
283  nNTCols += auxVect[i].wide;
284  }
285  }
286 };
287 
288 namespace interface1
289 {
290 
301 class CSVFeatureManager : public StringRowFeatureManagerIface
302 {
303 public:
307  CSVFeatureManager() :
308  _delimiter(','),
309  _numberOfTokens(0),
310  _isHeaderParsed(false) { }
311 
312  virtual ~CSVFeatureManager() { }
313 
317  void setDelimiter( char delimiter )
318  {
319  _delimiter = delimiter;
320  }
321 
322 public:
327  size_t getNumericTableNumberOfColumns() const
328  {
329  if (_modifiersManager)
330  {
331  return _modifiersManager->getNumberOfOutputFeatures();
332  }
333 
334  const size_t nDSCols = auxVect.size();
335  return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
336  }
337 
342  services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
343  {
344  DAAL_CHECK( dictionary, services::ErrorNullPtr );
345 
346  auxVect.clear();
347  funcList.clear();
348  fillAuxVectAndFuncList(*dictionary);
349  _numberOfTokens = dictionary->getNumberOfFeatures();
350 
351  return services::Status();
352  }
353 
358  void addModifier(const ModifierIface &modifier)
359  {
360  modifier.apply(funcList, auxVect);
361  }
362 
370  CSVFeatureManager &addModifier(const features::FeatureIdCollectionIfacePtr &featureIds,
371  const modifiers::csv::FeatureModifierIfacePtr &modifier,
372  services::Status *status = NULL)
373  {
374  services::Status localStatus;
375  if (!_modifiersManager)
376  {
377  _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
378  if (!localStatus)
379  {
380  services::internal::tryAssignStatusAndThrow(status, localStatus);
381  return *this;
382  }
383  }
384 
385  localStatus |= _modifiersManager->addModifier(featureIds, modifier);
386  if (!localStatus)
387  {
388  services::internal::tryAssignStatusAndThrow(status, localStatus);
389  return *this;
390  }
391 
392  return *this;
393  }
394 
400  void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
401  {
402  DAAL_ASSERT( rawRowData );
403 
404  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
405  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
406  {
407  _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
408  }
409  }
410 
417  virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize,
418  DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
419  {
420  DAAL_ASSERT( rawRowData );
421  DAAL_ASSERT( dictionary );
422 
423  _numberOfTokens = 0;
424 
425  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
426  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
427  {
428  _numberOfTokens++;
429  _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
430  }
431 
432  if (_modifiersManager)
433  {
434  _modifiersManager->prepare(_featuresInfo);
435  _modifiersManager->fillDictionary(*dictionary);
436  }
437  else
438  {
439  fillDictionaryWithoutModifiers(*dictionary);
440  }
441  }
442 
451  virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary,
452  NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
453  {
454  DAAL_ASSERT( nt );
455  DAAL_ASSERT( dictionary );
456  DAAL_ASSERT( rawRowData );
457 
458  nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
459  services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
460  _currentRowBlock.getNumberOfColumns());
461 
462  size_t i = 0;
463  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
464 
465  if (_modifiersManager)
466  {
467  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
468  {
469  _modifiersManager->setToken(i, tokenizer.getCurrentToken());
470  }
471  _modifiersManager->applyModifiers(rowBuffer);
472  }
473  else
474  {
475  DAAL_DATA_TYPE *row = rowBuffer.data();
476  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
477  {
478  const services::StringView token = tokenizer.getCurrentToken();
479  funcList[i](token.c_str(), auxVect[i], row);
480  }
481  }
482 
483  nt->releaseBlockOfRows(_currentRowBlock);
484  }
485 
490  void finalize(DataSourceDictionary *dictionary)
491  {
492  if (_modifiersManager)
493  {
494  _modifiersManager->finalize();
495  _modifiersManager->fillDictionary(*dictionary);
496  }
497  }
498 
499 private:
500  void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
501  {
502  const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
503  dictionary.setNumberOfFeatures(nFeatures);
504 
505  for (size_t i = 0; i < nFeatures; i++)
506  {
507  features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
508  dictionary[i].ntFeature.featureType = fType;
509 
510  switch (fType)
511  {
512  case features::DAAL_CONTINUOUS:
513  dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
514  break;
515 
516  case features::DAAL_ORDINAL:
517  case features::DAAL_CATEGORICAL:
518  dictionary[i].ntFeature.setType<int>();
519  break;
520  }
521  }
522 
523  fillAuxVectAndFuncList(dictionary);
524  }
525 
526  void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
527  {
528  const size_t nFeatures = dictionary.getNumberOfFeatures();
529  auxVect.resize(nFeatures);
530  funcList.resize(nFeatures);
531 
532  for (size_t i = 0; i < nFeatures; i++)
533  {
534  DataSourceFeature &feature = dictionary[i];
535  NumericTableFeature &ntFeature = feature.ntFeature;
536 
537  auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
538  funcList.push_back(getModifierFunctionPtr(ntFeature));
539  }
540  }
541 
542  static functionT getModifierFunctionPtr(const NumericTableFeature &ntFeature)
543  {
544  switch (ntFeature.featureType)
545  {
546  case features::DAAL_CONTINUOUS:
547  return ModifierIface::contFunc;
548 
549  case features::DAAL_ORDINAL:
550  case features::DAAL_CATEGORICAL:
551  return ModifierIface::catFunc;
552  }
553  return ModifierIface::nullFunc;
554  }
555 
556 protected:
557  char _delimiter;
558  services::Collection<functionT> funcList;
559  services::Collection<FeatureAuxData> auxVect;
560 
561 private:
562  bool _isHeaderParsed;
563  size_t _numberOfTokens;
564  BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
565 
566  internal::CSVFeaturesInfo _featuresInfo;
567  modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
568 };
570 } // namespace interface1
571 
572 using interface1::CSVFeatureManager;
573 
574 } // namespace data_management
575 } // namespace daal
576 
577 #endif
daal::data_management::OneHotEncoder
Methods of the class to mark a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:171
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:327
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:205
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:41
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:95
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Methods of the class to mark a feature as categorical.
Definition: csv_feature_manager.h:147
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:400
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:301
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:342
daal::services::ErrorNullPtr
Definition: error_indexes.h:139
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:37
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:285
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:317
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:370
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:69
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:574
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:358
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:266
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:307
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:451
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:417
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:490
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:161

For more complete information about compiler optimizations, see our Optimization Notice.