C++ API Reference for Intel® Data Analytics Acceleration Library 2018 Update 3

csv_feature_manager.h
1 /* file: csv_feature_manager.h */
2 /*******************************************************************************
3 * Copyright 2014-2018 Intel Corporation.
4 *
5 * This software and the related documents are Intel copyrighted materials, and
6 * your use of them is governed by the express license under which they were
7 * provided to you (License). Unless the License provides otherwise, you may not
8 * use, modify, copy, publish, distribute, disclose or transmit this software or
9 * the related documents without Intel's prior written permission.
10 *
11 * This software and the related documents are provided as is, with no express
12 * or implied warranties, other than those that are expressly stated in the
13 * License.
14 *******************************************************************************/
15 
16 /*
17 //++
18 // Implementation of the CSV feature manager class.
19 //--
20 */
21 
22 #ifndef __CSV_FEATURE_MANAGER_H__
23 #define __CSV_FEATURE_MANAGER_H__
24 
25 #include <sstream>
26 #include <list>
27 
28 #include "services/daal_memory.h"
29 #include "data_management/data_source/data_source.h"
30 #include "data_management/data_source/data_source_dictionary.h"
31 #include "data_management/data/numeric_table.h"
32 #include "data_management/data/homogen_numeric_table.h"
33 
34 namespace daal
35 {
36 namespace data_management
37 {
38 
43 struct FeatureAuxData
44 {
45  FeatureAuxData() : idx(0), wide(1), dsFeat(0), ntFeat(0), nCats(0) {};
46  size_t idx;
47  size_t wide;
48  size_t nCats;
49  DataSourceFeature *dsFeat;
50  NumericTableFeature *ntFeat;
51 };
52 
/**
 * Signature of a per-feature parsing callback.
 * word - NUL-terminated text of one field of the row
 * aux  - auxiliary data of the feature the field belongs to
 * arr  - output row; the callback writes starting at arr[aux.idx]
 */
53 typedef void (*functionT)(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
54 
59 class ModifierIface
60 {
61 public:
62  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const = 0;
63 
64  virtual ~ModifierIface() {}
65 
66  static void contFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
67  {
68  DAAL_DATA_TYPE f;
69  readNumeric<>( word, f );
70  arr[ aux.idx ] = f;
71  }
72 
73  static void catFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
74  {
75  std::string sWord(word);
76 
77  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
78  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
79 
80  if( it != catDict->end() )
81  {
82  arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
83  it->second.second++;
84  }
85  else
86  {
87  int index = (int)(catDict->size());
88  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
89  arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
90  aux.ntFeat->categoryNumber = index + 1;
91  }
92  }
93 
94 protected:
95  template<class T>
96  static void readNumeric(const char *text, T &f)
97  {
98  f = daal::services::daal_string_to_float(text, 0);
99  }
100 
101  static void binFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
102  {
103  std::string sWord(word);
104 
105  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
106  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
107 
108  size_t index = 0;
109 
110  if( it != catDict->end() )
111  {
112  index = it->second.first;
113  it->second.second++;
114  }
115  else
116  {
117  index = catDict->size();
118  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((int)index, 1) ) );
119  aux.ntFeat->categoryNumber = index + 1;
120  }
121 
122  size_t nCats = aux.nCats;
123 
124  for(size_t i=0; i<nCats; i++)
125  {
126  arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
127  }
128  }
129 
130  static void nullFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) {}
131 
132 };
133 
138 class MakeCategorical : public ModifierIface
139 {
140  size_t idx;
141 public:
142  MakeCategorical(size_t idx) : idx(idx) {}
143 
144  virtual ~MakeCategorical() {}
145 
146  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
147  {
148  size_t nCols = funcList.size();
149 
150  if(idx < nCols)
151  {
152  funcList[idx] = catFunc;
153  }
154  }
155 };
156 
161 class OneHotEncoder : public ModifierIface
162 {
163  size_t idx;
164  size_t nCats;
165 public:
166  OneHotEncoder(size_t idx, size_t nCats) : idx(idx), nCats(nCats) {}
167 
168  virtual ~OneHotEncoder() {}
169 
170  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
171  {
172  size_t nCols = funcList.size();
173 
174  if(idx < nCols)
175  {
176  funcList[idx] = binFunc;
177  auxVect[idx].nCats = nCats;
178  auxVect[idx].wide = nCats;
179  }
180 
181  size_t nNTCols = 0;
182  for(size_t i=0; i<nCols; i++)
183  {
184  auxVect[i].idx = nNTCols;
185  nNTCols += auxVect[i].wide;
186  }
187  }
188 };
189 
194 class ColumnFilter : public ModifierIface
195 {
196  bool oddFlag;
197  bool evenFlag;
198  bool noneFlag;
199  bool listFlag;
200  services::Collection<size_t> validList;
201 public:
202  ColumnFilter() : oddFlag(false), evenFlag(false), noneFlag(false), listFlag(false) {}
203 
204  virtual ~ColumnFilter() {}
205 
206  ColumnFilter& odd() { oddFlag=true; return *this;}
207  ColumnFilter& even() {evenFlag=true; return *this;}
208  ColumnFilter& none() {noneFlag=true; return *this;}
209  ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=true; return *this;}
210 
211  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
212  {
213  size_t nCols = funcList.size();
214 
215  if( oddFlag )
216  {
217  for(size_t i=0; i<nCols; i+=2)
218  {
219  funcList[i] = nullFunc;
220  auxVect[i].wide = 0;
221  }
222  }
223 
224  if( evenFlag )
225  {
226  for(size_t i=1; i<nCols; i+=2)
227  {
228  funcList[i] = nullFunc;
229  auxVect[i].wide = 0;
230  }
231  }
232 
233  if( noneFlag )
234  {
235  for(size_t i=0; i<nCols; i++)
236  {
237  funcList[i] = nullFunc;
238  auxVect[i].wide = 0;
239  }
240  }
241 
242  if( listFlag )
243  {
244  services::Collection<bool> flags(nCols);
245 
246  for(size_t i=0; i<nCols; i++)
247  {
248  flags[i] = false;
249  }
250 
251  for(size_t i=0; i<validList.size(); i++)
252  {
253  size_t el = validList[i];
254  if(el<nCols)
255  {
256  flags[el] = true;
257  }
258  }
259 
260  for(size_t i=0; i<nCols; i++)
261  {
262  if(flags[i]) continue;
263  funcList[i] = nullFunc;
264  auxVect[i].wide = 0;
265  }
266  }
267 
268  size_t nNTCols = 0;
269  for(size_t i=0; i<nCols; i++)
270  {
271  auxVect[i].idx = nNTCols;
272  nNTCols += auxVect[i].wide;
273  }
274  }
275 };
276 
277 namespace interface1
278 {
289 class CSVFeatureManager : public StringRowFeatureManagerIface
290 {
291 protected:
292  char _delimiter;
293 
294  services::Collection<functionT> funcList;
295  services::Collection<FeatureAuxData> auxVect;
296 
297 public:
301  CSVFeatureManager() : _delimiter(',') {}
302 
303  virtual ~CSVFeatureManager() {}
304 
308  void setDelimiter( char delimiter )
309  {
310  _delimiter = delimiter;
311  }
312 
313 public:
314  size_t getNumericTableNumberOfColumns()
315  {
316  size_t nDSCols = auxVect.size();
317  return auxVect[nDSCols-1].idx + auxVect[nDSCols-1].wide;
318  }
319 
320  virtual void parseRowAsDictionary( char *rawRowData, size_t rawDataSize,
321  DataSourceDictionary *dict ) DAAL_C11_OVERRIDE
322  {
323  char *word = new char[rawDataSize + 1];
324 
325  std::list<DataSourceFeature> featureList;
326 
327  bool isEmpty = false;
328  size_t nCols = 0;
329  size_t pos = 0;
330  while (true)
331  {
332  if (rawRowData[pos] == '\0') { break; }
333  size_t len = 0;
334 
335  while (len < rawDataSize && rawRowData[pos] != _delimiter && rawRowData[pos] != '\0')
336  {
337  word[len] = rawRowData[pos];
338  len++;
339  pos++;
340  }
341 
342  word[len] = '\0';
343 
344  if (rawRowData[pos] == _delimiter) pos++;
345 
346  DAAL_DATA_TYPE f;
347  isEmpty = (word[0] == 0 || word[0] == '\r' || word[0] == '\n');
348 
349  if (isEmpty) { break; }
350 
351  bool isNumeric = readNumericDetailed<>( word, f );
352 
353  DataSourceFeature feat;
354 
355  if( isNumeric )
356  {
357  feat.setType<DAAL_DATA_TYPE>();
358  }
359  else
360  {
361  feat.setType<int>();
362  feat.ntFeature.featureType = data_feature_utils::DAAL_CATEGORICAL;
363  }
364 
365  featureList.push_back(feat);
366 
367  nCols++;
368  }
369 
370  delete[] word;
371 
372  dict->setNumberOfFeatures(nCols);
373 
374  size_t idx = 0;
375  for( std::list<DataSourceFeature>::iterator it = featureList.begin() ; it != featureList.end() ; it++ )
376  {
377  dict->setFeature( *it, idx );
378  idx++;
379  if( idx == nCols ) { break; }
380  }
381 
382  initializeFeatureDetails(dict);
383  }
384 
385  void setFeatureDetailsFromDictionary(DataSourceDictionary* dict)
386  {
387  initializeFeatureDetails(dict);
388  }
389 
390  void addModifier( const ModifierIface& modifier )
391  {
392  modifier.apply( funcList, auxVect );
393  }
394 
403  virtual void parseRowIn ( char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict,
404  NumericTable *nt, size_t ntRowIndex ) DAAL_C11_OVERRIDE
405  {
406  size_t dFeatures = auxVect.size();
407 
408  char const **words = new char const *[dFeatures];
409 
410  if(!words)
411  {
412  return;
413  }
414 
415  nt->getBlockOfRows( ntRowIndex, 1, writeOnly, block );
416  DAAL_DATA_TYPE *row = block.getBlockPtr();
417 
418  size_t pos = 0;
419  words[ pos ] = rawRowData;
420 
421  for( size_t i=0; i<rawDataSize; i++ )
422  {
423  if( rawRowData[i] == _delimiter )
424  {
425  rawRowData[i] = 0;
426  pos++;
427  if(pos < dFeatures)
428  {
429  words[pos] = rawRowData + i + 1;
430  }
431  }
432  }
433  rawRowData[rawDataSize] = 0;
434 
435  const char* zeroStr = "0";
436  for( pos++; pos<dFeatures; pos++ )
437  {
438  words[pos] = zeroStr;
439  }
440 
441  for( size_t i = 0; i < dFeatures; i++ )
442  {
443  funcList[i]( words[i], auxVect[i], row );
444  }
445 
446  nt->releaseBlockOfRows( block );
447 
448  delete[] words;
449  }
450 
451 protected:
452  BlockDescriptor<DAAL_DATA_TYPE> block;
453 
454  template<class T>
455  bool readNumericDetailed(char *text, T &f)
456  {
457  std::istringstream iss(text);
458  iss >> f;
459  return !(iss.fail());
460  }
461 
462  void initializeFeatureDetails(DataSourceDictionary* dict)
463  {
464  const size_t nCols = dict->getNumberOfFeatures();
465  funcList.resize(nCols);
466  auxVect.resize(nCols);
467 
468  for(size_t i=0; i<nCols; i++)
469  {
470  if( (*dict)[i].ntFeature.featureType == data_feature_utils::DAAL_CONTINUOUS )
471  {
472  funcList.push_back( ModifierIface::contFunc );
473  }
474  else
475  {
476  funcList.push_back( ModifierIface::catFunc );
477  }
478  auxVect.push_back( FeatureAuxData() );
479  auxVect[i].idx = i;
480  auxVect[i].dsFeat = &(*dict)[i];
481  auxVect[i].ntFeat = &auxVect[i].dsFeat->ntFeature;
482  }
483  }
484 };
486 } // namespace interface1
487 using interface1::CSVFeatureManager;
488 
489 }
490 }
491 #endif
daal::data_management::OneHotEncoder
Class whose methods mark a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:161
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:320
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:69
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:194
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:43
daal
Definition: algorithm_base_common.h:31
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:403
daal::data_management::interface1::DataSourceFeature::setType
void setType()
Definition: data_source_dictionary.h:128
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:44
daal::data_management::MakeCategorical
Class whose methods mark a feature as categorical.
Definition: csv_feature_manager.h:138
daal::data_management::interface1::DataSourceFeature
Data structure that describes the Data Source feature.
Definition: data_source_dictionary.h:49
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:289
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:308
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:59
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:574
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:301
daal::data_management::interface1::BlockDescriptor< DAAL_DATA_TYPE >
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:158

For more complete information about compiler optimizations, see our Optimization Notice.