C++ API Reference for Intel® Data Analytics Acceleration Library 2018 Update 1

csv_feature_manager.h
1 /* file: csv_feature_manager.h */
2 /*******************************************************************************
3 * Copyright 2014-2017 Intel Corporation
4 * All Rights Reserved.
5 *
6 * If this software was obtained under the Intel Simplified Software License,
7 * the following terms apply:
8 *
9 * The source code, information and material ("Material") contained herein is
10 * owned by Intel Corporation or its suppliers or licensors, and title to such
11 * Material remains with Intel Corporation or its suppliers or licensors. The
12 * Material contains proprietary information of Intel or its suppliers and
13 * licensors. The Material is protected by worldwide copyright laws and treaty
14 * provisions. No part of the Material may be used, copied, reproduced,
15 * modified, published, uploaded, posted, transmitted, distributed or disclosed
16 * in any way without Intel's prior express written permission. No license under
17 * any patent, copyright or other intellectual property rights in the Material
18 * is granted to or conferred upon you, either expressly, by implication,
19 * inducement, estoppel or otherwise. Any license under such intellectual
20 * property rights must be express and approved by Intel in writing.
21 *
22 * Unless otherwise agreed by Intel in writing, you may not remove or alter this
23 * notice or any other notice embedded in Materials by Intel or Intel's
24 * suppliers or licensors in any way.
25 *
26 *
27 * If this software was obtained under the Apache License, Version 2.0 (the
28 * "License"), the following terms apply:
29 *
30 * You may not use this file except in compliance with the License. You may
31 * obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
32 *
33 *
34 * Unless required by applicable law or agreed to in writing, software
35 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
36 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 *
38 * See the License for the specific language governing permissions and
39 * limitations under the License.
40 *******************************************************************************/
41 
42 /*
43 //++
44 // Implementation of the CSV feature manager class.
45 //--
46 */
47 
48 #ifndef __CSV_FEATURE_MANAGER_H__
49 #define __CSV_FEATURE_MANAGER_H__
50 
#include <list>
#include <sstream>
#include <string>

#include "services/daal_memory.h"
#include "data_management/data_source/data_source.h"
#include "data_management/data_source/data_source_dictionary.h"
#include "data_management/data/numeric_table.h"
#include "data_management/data/homogen_numeric_table.h"
59 
60 namespace daal
61 {
62 namespace data_management
63 {
64 
69 struct FeatureAuxData
70 {
71  FeatureAuxData() : idx(0), wide(1), dsFeat(0), ntFeat(0), nCats(0) {};
72  size_t idx;
73  size_t wide;
74  size_t nCats;
75  DataSourceFeature *dsFeat;
76  NumericTableFeature *ntFeat;
77 };
78 
79 typedef void (*functionT)(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
80 
85 class ModifierIface
86 {
87 public:
88  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const = 0;
89 
90  virtual ~ModifierIface() {}
91 
92  static void contFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
93  {
94  DAAL_DATA_TYPE f;
95  readNumeric<>( word, f );
96  arr[ aux.idx ] = f;
97  }
98 
99  static void catFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
100  {
101  std::string sWord(word);
102 
103  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
104  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
105 
106  if( it != catDict->end() )
107  {
108  arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
109  it->second.second++;
110  }
111  else
112  {
113  int index = (int)(catDict->size());
114  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>(index, 1) ) );
115  arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
116  aux.ntFeat->categoryNumber = index + 1;
117  }
118  }
119 
120 protected:
121  template<class T>
122  static void readNumeric(const char *text, T &f)
123  {
124  f = daal::services::daal_string_to_float(text, 0);
125  }
126 
127  static void binFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
128  {
129  std::string sWord(word);
130 
131  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
132  CategoricalFeatureDictionary::iterator it = catDict->find( sWord );
133 
134  size_t index = 0;
135 
136  if( it != catDict->end() )
137  {
138  index = it->second.first;
139  it->second.second++;
140  }
141  else
142  {
143  index = catDict->size();
144  catDict->insert( std::pair<std::string, std::pair<int, int> >( sWord, std::pair<int, int>((int)index, 1) ) );
145  aux.ntFeat->categoryNumber = index + 1;
146  }
147 
148  size_t nCats = aux.nCats;
149 
150  for(size_t i=0; i<nCats; i++)
151  {
152  arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
153  }
154  }
155 
156  static void nullFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) {}
157 
158 };
159 
164 class MakeCategorical : public ModifierIface
165 {
166  size_t idx;
167 public:
168  MakeCategorical(size_t idx) : idx(idx) {}
169 
170  virtual ~MakeCategorical() {}
171 
172  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
173  {
174  size_t nCols = funcList.size();
175 
176  if(idx < nCols)
177  {
178  funcList[idx] = catFunc;
179  }
180  }
181 };
182 
187 class OneHotEncoder : public ModifierIface
188 {
189  size_t idx;
190  size_t nCats;
191 public:
192  OneHotEncoder(size_t idx, size_t nCats) : idx(idx), nCats(nCats) {}
193 
194  virtual ~OneHotEncoder() {}
195 
196  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
197  {
198  size_t nCols = funcList.size();
199 
200  if(idx < nCols)
201  {
202  funcList[idx] = binFunc;
203  auxVect[idx].nCats = nCats;
204  auxVect[idx].wide = nCats;
205  }
206 
207  size_t nNTCols = 0;
208  for(size_t i=0; i<nCols; i++)
209  {
210  auxVect[i].idx = nNTCols;
211  nNTCols += auxVect[i].wide;
212  }
213  }
214 };
215 
220 class ColumnFilter : public ModifierIface
221 {
222  bool oddFlag;
223  bool evenFlag;
224  bool noneFlag;
225  bool listFlag;
226  services::Collection<size_t> validList;
227 public:
228  ColumnFilter() : oddFlag(false), evenFlag(false), noneFlag(false), listFlag(false) {}
229 
230  virtual ~ColumnFilter() {}
231 
232  ColumnFilter& odd() { oddFlag=true; return *this;}
233  ColumnFilter& even() {evenFlag=true; return *this;}
234  ColumnFilter& none() {noneFlag=true; return *this;}
235  ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=true; return *this;}
236 
237  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
238  {
239  size_t nCols = funcList.size();
240 
241  if( oddFlag )
242  {
243  for(size_t i=0; i<nCols; i+=2)
244  {
245  funcList[i] = nullFunc;
246  auxVect[i].wide = 0;
247  }
248  }
249 
250  if( evenFlag )
251  {
252  for(size_t i=1; i<nCols; i+=2)
253  {
254  funcList[i] = nullFunc;
255  auxVect[i].wide = 0;
256  }
257  }
258 
259  if( noneFlag )
260  {
261  for(size_t i=0; i<nCols; i++)
262  {
263  funcList[i] = nullFunc;
264  auxVect[i].wide = 0;
265  }
266  }
267 
268  if( listFlag )
269  {
270  services::Collection<bool> flags(nCols);
271 
272  for(size_t i=0; i<nCols; i++)
273  {
274  flags[i] = false;
275  }
276 
277  for(size_t i=0; i<validList.size(); i++)
278  {
279  size_t el = validList[i];
280  if(el<nCols)
281  {
282  flags[el] = true;
283  }
284  }
285 
286  for(size_t i=0; i<nCols; i++)
287  {
288  if(flags[i]) continue;
289  funcList[i] = nullFunc;
290  auxVect[i].wide = 0;
291  }
292  }
293 
294  size_t nNTCols = 0;
295  for(size_t i=0; i<nCols; i++)
296  {
297  auxVect[i].idx = nNTCols;
298  nNTCols += auxVect[i].wide;
299  }
300  }
301 };
302 
303 namespace interface1
304 {
315 class CSVFeatureManager : public StringRowFeatureManagerIface
316 {
317 protected:
318  char _delimiter;
319 
320  services::Collection<functionT> funcList;
321  services::Collection<FeatureAuxData> auxVect;
322 
323 public:
327  CSVFeatureManager() : _delimiter(',') {}
328 
329  virtual ~CSVFeatureManager() {}
330 
334  void setDelimiter( char delimiter )
335  {
336  _delimiter = delimiter;
337  }
338 
339 public:
340  size_t getNumericTableNumberOfColumns()
341  {
342  size_t nDSCols = auxVect.size();
343  return auxVect[nDSCols-1].idx + auxVect[nDSCols-1].wide;
344  }
345 
346  virtual void parseRowAsDictionary( char *rawRowData, size_t rawDataSize,
347  DataSourceDictionary *dict ) DAAL_C11_OVERRIDE
348  {
349  char *word = new char[rawDataSize + 1];
350 
351  std::list<DataSourceFeature> featureList;
352 
353  bool isEmpty = false;
354  size_t nCols = 0;
355  size_t pos = 0;
356  while (true)
357  {
358  if (rawRowData[pos] == '\0') { break; }
359  size_t len = 0;
360 
361  while (len < rawDataSize && rawRowData[pos] != _delimiter && rawRowData[pos] != '\0')
362  {
363  word[len] = rawRowData[pos];
364  len++;
365  pos++;
366  }
367 
368  word[len] = '\0';
369 
370  if (rawRowData[pos] == _delimiter) pos++;
371 
372  DAAL_DATA_TYPE f;
373  isEmpty = (word[0] == 0 || word[0] == '\r' || word[0] == '\n');
374 
375  if (isEmpty) { break; }
376 
377  bool isNumeric = readNumericDetailed<>( word, f );
378 
379  DataSourceFeature feat;
380 
381  if( isNumeric )
382  {
383  feat.setType<DAAL_DATA_TYPE>();
384  }
385  else
386  {
387  feat.setType<int>();
388  feat.ntFeature.featureType = data_feature_utils::DAAL_CATEGORICAL;
389  }
390 
391  featureList.push_back(feat);
392 
393  nCols++;
394  }
395 
396  delete[] word;
397 
398  dict->setNumberOfFeatures(nCols);
399 
400  size_t idx = 0;
401  for( std::list<DataSourceFeature>::iterator it = featureList.begin() ; it != featureList.end() ; it++ )
402  {
403  dict->setFeature( *it, idx );
404  idx++;
405  if( idx == nCols ) { break; }
406  }
407 
408  initializeFeatureDetails(dict);
409  }
410 
411  void setFeatureDetailsFromDictionary(DataSourceDictionary* dict)
412  {
413  initializeFeatureDetails(dict);
414  }
415 
416  void addModifier( const ModifierIface& modifier )
417  {
418  modifier.apply( funcList, auxVect );
419  }
420 
429  virtual void parseRowIn ( char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict,
430  NumericTable *nt, size_t ntRowIndex ) DAAL_C11_OVERRIDE
431  {
432  size_t dFeatures = auxVect.size();
433 
434  char const **words = new char const *[dFeatures];
435 
436  if(!words)
437  {
438  return;
439  }
440 
441  nt->getBlockOfRows( ntRowIndex, 1, writeOnly, block );
442  DAAL_DATA_TYPE *row = block.getBlockPtr();
443 
444  size_t pos = 0;
445  words[ pos ] = rawRowData;
446 
447  for( size_t i=0; i<rawDataSize; i++ )
448  {
449  if( rawRowData[i] == _delimiter )
450  {
451  rawRowData[i] = 0;
452  pos++;
453  if(pos < dFeatures)
454  {
455  words[pos] = rawRowData + i + 1;
456  }
457  }
458  }
459  rawRowData[rawDataSize] = 0;
460 
461  const char* zeroStr = "0";
462  for( pos++; pos<dFeatures; pos++ )
463  {
464  words[pos] = zeroStr;
465  }
466 
467  for( size_t i = 0; i < dFeatures; i++ )
468  {
469  funcList[i]( words[i], auxVect[i], row );
470  }
471 
472  nt->releaseBlockOfRows( block );
473 
474  delete[] words;
475  }
476 
477 protected:
478  BlockDescriptor<DAAL_DATA_TYPE> block;
479 
480  template<class T>
481  bool readNumericDetailed(char *text, T &f)
482  {
483  std::istringstream iss(text);
484  iss >> f;
485  return !(iss.fail());
486  }
487 
488  void initializeFeatureDetails(DataSourceDictionary* dict)
489  {
490  const size_t nCols = dict->getNumberOfFeatures();
491  funcList.resize(nCols);
492  auxVect.resize(nCols);
493 
494  for(size_t i=0; i<nCols; i++)
495  {
496  if( (*dict)[i].ntFeature.featureType == data_feature_utils::DAAL_CONTINUOUS )
497  {
498  funcList.push_back( ModifierIface::contFunc );
499  }
500  else
501  {
502  funcList.push_back( ModifierIface::catFunc );
503  }
504  auxVect.push_back( FeatureAuxData() );
505  auxVect[i].idx = i;
506  auxVect[i].dsFeat = &(*dict)[i];
507  auxVect[i].ntFeat = &auxVect[i].dsFeat->ntFeature;
508  }
509  }
510 };
512 } // namespace interface1
513 using interface1::CSVFeatureManager;
514 
515 }
516 }
517 #endif
daal::data_management::OneHotEncoder
Methods of the class that marks a feature as binary categorical (one-hot encoded).
Definition: csv_feature_manager.h:187
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:346
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:95
daal::data_management::ColumnFilter
Methods of the class that filters data source features out of the output numeric table.
Definition: csv_feature_manager.h:220
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:69
daal
Definition: algorithm_base_common.h:57
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dict, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:429
daal::data_management::interface1::DataSourceFeature::setType
void setType()
Definition: data_source_dictionary.h:154
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:70
daal::data_management::MakeCategorical
Methods of the class that marks a feature as categorical.
Definition: csv_feature_manager.h:164
daal::data_management::interface1::DataSourceFeature
Data structure that describes the Data Source feature.
Definition: data_source_dictionary.h:75
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:315
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:334
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:85
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:600
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:327
daal::data_management::interface1::BlockDescriptor< DAAL_DATA_TYPE >
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:184

For more complete information about compiler optimizations, see our Optimization Notice.