C++ API Reference for Intel® Data Analytics Acceleration Library 2018 Update 1

csv_data_source.h
1 /* file: csv_data_source.h */
2 /*******************************************************************************
3 * Copyright 2014-2017 Intel Corporation
4 * All Rights Reserved.
5 *
6 * If this software was obtained under the Intel Simplified Software License,
7 * the following terms apply:
8 *
9 * The source code, information and material ("Material") contained herein is
10 * owned by Intel Corporation or its suppliers or licensors, and title to such
11 * Material remains with Intel Corporation or its suppliers or licensors. The
12 * Material contains proprietary information of Intel or its suppliers and
13 * licensors. The Material is protected by worldwide copyright laws and treaty
14 * provisions. No part of the Material may be used, copied, reproduced,
15 * modified, published, uploaded, posted, transmitted, distributed or disclosed
16 * in any way without Intel's prior express written permission. No license under
17 * any patent, copyright or other intellectual property rights in the Material
18 * is granted to or conferred upon you, either expressly, by implication,
19 * inducement, estoppel or otherwise. Any license under such intellectual
20 * property rights must be express and approved by Intel in writing.
21 *
22 * Unless otherwise agreed by Intel in writing, you may not remove or alter this
23 * notice or any other notice embedded in Materials by Intel or Intel's
24 * suppliers or licensors in any way.
25 *
26 *
27 * If this software was obtained under the Apache License, Version 2.0 (the
28 * "License"), the following terms apply:
29 *
30 * You may not use this file except in compliance with the License. You may
31 * obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
32 *
33 *
34 * Unless required by applicable law or agreed to in writing, software
35 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
36 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 *
38 * See the License for the specific language governing permissions and
39 * limitations under the License.
40 *******************************************************************************/
41 
42 /*
43 //++
44 // Implementation of the file data source class.
45 //--
46 */
47 
48 #ifndef __CSV_DATA_SOURCE_H__
49 #define __CSV_DATA_SOURCE_H__
50 
51 #include "services/daal_memory.h"
52 #include "data_management/data_source/data_source.h"
53 #include "data_management/data/data_dictionary.h"
54 #include "data_management/data/numeric_table.h"
55 #include "data_management/data/homogen_numeric_table.h"
56 
57 namespace daal
58 {
59 namespace data_management
60 {
61 
62 namespace interface1
63 {
/**
 *  Abstract data source that reads text data one line at a time and parses each line into a numeric table.
 *  Derived classes supply the line reader by implementing iseof() and readLine().
 *
 *  \tparam _featureManager         Type of the featureManager member that parses raw lines
 *                                  (see the parseRowIn and parseRowAsDictionary calls in this class)
 *  \tparam _summaryStatisticsType  Type forwarded to the DataSourceTemplate base class
 */
73 template< typename _featureManager, typename _summaryStatisticsType = DAAL_SUMMARY_STATISTICS_TYPE >
74 class CsvDataSource : public DataSourceTemplate<data_management::HomogenNumericTable<DAAL_DATA_TYPE>, _summaryStatisticsType>
75 {
76 public:
77  using DataSource::checkDictionary; // names from the DataSource base brought into this class template's scope
78  using DataSource::checkNumericTable;
79  using DataSource::freeNumericTable;
80  using DataSource::_dict;
81  using DataSource::_initialMaxRows;
82 
86  typedef _featureManager FeatureManager; // public alias of the feature manager template argument
87 
88 protected:
89  typedef data_management::HomogenNumericTable<DAAL_DATA_TYPE> DefaultNumericTableType; // table type passed to DataSourceTemplate
90 
91  FeatureManager featureManager; // parser instance used by the loadDataBlock and dictionary methods
92 
93 public:
/**
 *  Constructs the data source and allocates the initial raw line buffer of 1024 bytes.
 *  \param[in] doAllocateNumericTable         Flag forwarded to the DataSourceTemplate base constructor
 *  \param[in] doCreateDictionaryFromContext  Flag forwarded to the DataSourceTemplate base constructor
 *  \param[in] initialMaxRows                 Stored in _initialMaxRows; loadDataBlock(NumericTable*) uses it as the
 *                                            row count of its first temporary table
 *
 *  NOTE(review): the pointer returned by daal_malloc is not checked; if the allocation fails
 *  _rawLineBuffer stays NULL while _rawLineBufferLen is 1024 - confirm how derived readers handle this.
 */
102  CsvDataSource(DataSourceIface::NumericTableAllocationFlag doAllocateNumericTable = DataSource::notAllocateNumericTable,
103  DataSourceIface::DictionaryCreationFlag doCreateDictionaryFromContext = DataSource::notDictionaryFromContext,
104  size_t initialMaxRows = 10):
105  DataSourceTemplate<DefaultNumericTableType, _summaryStatisticsType>(doAllocateNumericTable, doCreateDictionaryFromContext),
106  _rawLineBuffer(NULL),
107  _rawLineLength(0)
108  {
109  _rawLineBufferLen = 1024; // initial capacity in bytes; enlargeBuffer() doubles it on demand
110  _rawLineBuffer = (char *)daal::services::daal_malloc(_rawLineBufferLen);
111 
112  _contextDictFlag = false; // set to true by createDictionaryFromContext()
113  _initialMaxRows = initialMaxRows;
114  }
115 
/**
 *  Releases the raw line buffer with daal_free and then calls the base class freeNumericTable().
 */
116  ~CsvDataSource()
117  {
118  daal::services::daal_free( _rawLineBuffer );
119  DataSourceTemplate<DefaultNumericTableType, _summaryStatisticsType>::freeNumericTable(); // qualified call to the DataSourceTemplate implementation
120  }
121 
/**
 *  Returns the feature manager owned by this data source.
 *  \return Mutable reference to the featureManager member
 */
126  FeatureManager &getFeatureManager()
127  {
128  return featureManager;
129  }
130 
131 public:
/**
 *  Returns the number of numeric table columns as reported by the feature manager.
 *  \return Value of featureManager.getNumericTableNumberOfColumns()
 */
132  size_t getNumericTableNumberOfColumns() DAAL_C11_OVERRIDE
133  {
134  return featureManager.getNumericTableNumberOfColumns();
135  }
136 
/**
 *  Sets the data source dictionary and configures the feature manager from it.
 *  The feature manager is updated only when the base class accepted the dictionary, so a rejected
 *  dictionary cannot leave the feature manager configured differently from the dictionary in use.
 *  \param[in] dict  Dictionary passed to DataSource::setDictionary and, on success, to the feature manager
 *  \return Status returned by DataSource::setDictionary
 */
137  services::Status setDictionary(DataSourceDictionary *dict) DAAL_C11_OVERRIDE
138  {
139  services::Status s = DataSource::setDictionary(dict);
140  if(!s) { return s; } // base class rejected the dictionary: leave the feature manager untouched
141  featureManager.setFeatureDetailsFromDictionary(dict);
142  return s;
143  }
144 
/**
 *  Reads rows chunk by chunk into temporary tables and merges them into the provided numeric table.
 *  Each chunk is loaded into a freshly created HomogenNumericTable; the chunk size starts at
 *  _initialMaxRows (10 when that value is 0) and doubles after every full chunk. Reading stops at the
 *  first chunk that returns fewer rows than requested, or when a temporary table cannot be created.
 *  The target table is then reset to the total row count and the chunks are copied into it in order.
 *  \param[in,out] nt  Table that receives the rows; must be non-null and must not use the csrArray layout
 *                     (see checkInputNumericTable)
 *  \return Total number of rows read, or 0 if a dictionary, table or resize check failed
 *          (the failing status is added to this->_status)
 *
 *  NOTE(review): the statuses returned by getBlockOfRows / releaseBlockOfRows are not inspected here.
 */
145  size_t loadDataBlock(NumericTable* nt) DAAL_C11_OVERRIDE
146  {
147  services::Status s = checkDictionary();
148  if(!s)
149  {
150  this->_status.add(services::throwIfPossible(s));
151  return 0;
152  }
153  s = checkInputNumericTable(nt);
154  if(!s)
155  {
156  this->_status.add(services::throwIfPossible(s));
157  return 0;
158  }
159 
160  size_t maxRows = (_initialMaxRows > 0 ? _initialMaxRows : 10); // first chunk size; falls back to 10 rows
161  size_t nrows = 0; // total rows read across all chunks
162  const size_t ncols = getNumericTableNumberOfColumns();
163  DataCollection tables; // temporary per-chunk tables, in reading order
164  for( ;; maxRows *= 2) // chunk size doubles after each full chunk
165  {
166  NumericTablePtr ntCurrent = HomogenNumericTable<DAAL_DATA_TYPE>::create(ncols, maxRows, NumericTableIface::doAllocate, &s);
167  if (!s)
168  {
169  this->_status.add(services::throwIfPossible(services::Status(services::ErrorNumericTableNotAllocated)));
170  break; // chunks read so far are still merged below
171  }
172  tables.push_back(ntCurrent);
173  const size_t rows = loadDataBlock(maxRows, ntCurrent.get());
174  nrows += rows;
175  if (rows < maxRows) // short chunk: nothing more was read
176  break;
177  }
178 
179  s = resetNumericTable(nt, nrows); // sets the feature dictionary of nt and resizes it to nrows rows
180  if(!s)
181  {
182  this->_status.add(services::throwIfPossible(s));
183  return 0;
184  }
185 
186  BlockDescriptor<DAAL_DATA_TYPE> blockCurrent, block;
187  size_t pos = 0; // destination row in nt for the next chunk
188  for (size_t i = 0; i < tables.size(); i++)
189  {
190  NumericTable *ntCurrent = (NumericTable*)(tables[i].get());
191  size_t rows = ntCurrent->getNumberOfRows();
192 
193  if(!rows)
194  continue;
195 
196  ntCurrent->getBlockOfRows(0, rows, readOnly, blockCurrent);
197  nt->getBlockOfRows(pos, rows, writeOnly, block);
198 
199  services::daal_memcpy_s(block.getBlockPtr(), rows * ncols * sizeof(DAAL_DATA_TYPE), blockCurrent.getBlockPtr(), rows * ncols * sizeof(DAAL_DATA_TYPE)); // copies rows * ncols values of DAAL_DATA_TYPE
200 
201  ntCurrent->releaseBlockOfRows(blockCurrent);
202  nt->releaseBlockOfRows(block);
203 
204  DataSourceTemplate<DefaultNumericTableType, _summaryStatisticsType>::combineStatistics( ntCurrent, nt, pos == 0); // last argument is true only for the first non-empty chunk
205  pos += rows;
206  }
207  return nrows;
208  }
209 
/**
 *  Loads up to maxRows rows into the provided numeric table, starting at row 0, and then resizes
 *  the table to the number of rows actually loaded.
 *  \param[in]     maxRows  Maximum number of rows to read
 *  \param[in,out] nt       Table that receives the rows
 *  \return Number of rows loaded; 0 if the four-argument loadDataBlock rejected the request
 *          (for example when nt is null), in which case the error is recorded in this->_status
 */
210  size_t loadDataBlock(size_t maxRows, NumericTable* nt) DAAL_C11_OVERRIDE
211  {
212  size_t nLines = loadDataBlock(maxRows, 0, maxRows, nt);
213  if(nt) { nt->resize( nLines ); } // a null table was already reported by the call above; do not dereference it
214  return nLines;
215  }
216 
/**
 *  Reads up to maxRows lines and parses them into rows [rowOffset, rowOffset + maxRows) of the
 *  provided numeric table, which is first reset to fullRows rows.
 *  \param[in]     maxRows    Maximum number of lines to read
 *  \param[in]     rowOffset  Index of the first row of nt to fill
 *  \param[in]     fullRows   Number of rows nt is resized to; rowOffset + maxRows must not exceed it
 *  \param[in,out] nt         Table that receives the rows; must be non-null and must not use the csrArray layout
 *  \return rowOffset plus the number of lines parsed, or 0 if a check failed
 *          (the failing status is added to this->_status)
 *
 *  Reading stops early at end of input, when readLine() reports an error, or on an empty line.
 */
217  size_t loadDataBlock(size_t maxRows, size_t rowOffset, size_t fullRows, NumericTable *nt) DAAL_C11_OVERRIDE
218  {
219  services::Status s = checkDictionary();
220  if(!s)
221  {
222  this->_status.add(services::throwIfPossible(s));
223  return 0;
224  }
225  s = checkInputNumericTable(nt);
226  if(!s)
227  {
228  this->_status.add(services::throwIfPossible(s));
229  return 0;
230  }
231 
232  if (maxRows > fullRows || rowOffset > fullRows - maxRows) // same test as rowOffset + maxRows > fullRows, but the sum cannot wrap around
233  {
234  this->_status.add(services::throwIfPossible(services::ErrorIncorrectDataRange));
235  return 0;
236  }
237 
238  s = resetNumericTable(nt, fullRows); // sets the feature dictionary of nt and resizes it to fullRows rows
239  if(!s)
240  {
241  this->_status.add(services::throwIfPossible(s));
242  return 0;
243  }
244 
245  size_t j = 0; // number of lines parsed so far
246  for(; j < maxRows && !iseof() ; j++ )
247  {
248  s = readLine();
249  if(!s || !_rawLineLength) // read error or empty line ends the block
250  break;
251  featureManager.parseRowIn( _rawLineBuffer, _rawLineLength, this->_dict.get(), nt, rowOffset + j );
252 
253  DataSourceTemplate<DefaultNumericTableType, _summaryStatisticsType>::updateStatistics( j, nt, rowOffset );
254  }
255 
256  return rowOffset + j;
257  }
258 
/**
 *  Forwards to DataSource::loadDataBlock() without additional processing.
 *  NOTE(review): presumably re-declared so this base overload stays visible next to the overloads
 *  defined in this class - confirm.
 *  \return Value returned by DataSource::loadDataBlock()
 */
259  size_t loadDataBlock() DAAL_C11_OVERRIDE
260  {
261  return DataSource::loadDataBlock();
262  }
263 
/**
 *  Forwards to DataSource::loadDataBlock(maxRows) without additional processing.
 *  \param[in] maxRows  Passed unchanged to the base class implementation
 *  \return Value returned by DataSource::loadDataBlock(maxRows)
 */
264  size_t loadDataBlock(size_t maxRows) DAAL_C11_OVERRIDE
265  {
266  return DataSource::loadDataBlock(maxRows);
267  }
268 
/**
 *  Forwards to DataSource::loadDataBlock(maxRows, rowOffset, fullRows) without additional processing.
 *  \param[in] maxRows    Passed unchanged to the base class implementation
 *  \param[in] rowOffset  Passed unchanged to the base class implementation
 *  \param[in] fullRows   Passed unchanged to the base class implementation
 *  \return Value returned by the base class implementation
 */
269  size_t loadDataBlock(size_t maxRows, size_t rowOffset, size_t fullRows) DAAL_C11_OVERRIDE
270  {
271  return DataSource::loadDataBlock(maxRows, rowOffset, fullRows);
272  }
273 
274 
/**
 *  Creates the data source dictionary by reading one line and letting the feature manager parse it
 *  as a dictionary description.
 *  \return ErrorDictionaryAlreadyAvailable status if a dictionary is already set; the failing status if
 *          the dictionary cannot be created or the line cannot be read; an OK status otherwise
 *
 *  NOTE(review): _contextDictFlag is set to true before the fallible steps and is not reset on failure.
 */
275  services::Status createDictionaryFromContext() DAAL_C11_OVERRIDE
276  {
277  if(_dict)
278  return services::throwIfPossible(services::Status(services::ErrorDictionaryAlreadyAvailable));
279 
280  _contextDictFlag = true;
281  services::Status s;
282  _dict = DataSourceDictionary::create(&s);
283  if(!s) return s;
284 
285  s = readLine(); // consumes one line of input
286  if(!s)
287  {
288  this->_dict.reset(); // drop the empty dictionary created above
289  return services::throwIfPossible(s);
290  }
291 
292  featureManager.parseRowAsDictionary( _rawLineBuffer, _rawLineLength, this->_dict.get() );
293  return services::Status();
294  }
295 
/**
 *  Reports the number of available rows; this implementation always returns 0.
 *  \return 0
 */
296  size_t getNumberOfAvailableRows() DAAL_C11_OVERRIDE
297  {
298  return 0;
299  }
300 
301 protected:
/** Implemented by derived classes; the row-loading loop stops reading once this returns true. */
302  virtual bool iseof() const = 0;
/** Implemented by derived classes; callers in this class read _rawLineBuffer and _rawLineLength after a successful call. */
303  virtual services::Status readLine() = 0;
304 
/**
 *  Prepares a numeric table for loading: copies the numeric-table feature descriptions from the data
 *  source dictionary into the table dictionary, resizes the table, and marks it as non-normalized.
 *  \param[in,out] nt       Table to reset
 *  \param[in]     newSize  Size passed to resizeNumericTableImpl
 *  \return Status of resizeNumericTableImpl on failure, an OK status otherwise
 *
 *  NOTE(review): assumes nt, its dictionary and _dict are non-null and that _dict holds at least
 *  getNumericTableNumberOfColumns() entries - confirm against callers.
 */
305  virtual services::Status resetNumericTable(NumericTable *nt, const size_t newSize)
306  {
307  services::Status s;
308 
309  NumericTableDictionaryPtr ntDict = nt->getDictionarySharedPtr();
310  const size_t nFeatures = getNumericTableNumberOfColumns();
311  ntDict->setNumberOfFeatures(nFeatures);
312  for (size_t i = 0; i < nFeatures; i++)
313  ntDict->setFeature((*_dict)[i].ntFeature, i); // i-th data source feature supplies the i-th table feature
314 
315  s = DataSourceTemplate<DefaultNumericTableType, _summaryStatisticsType>::resizeNumericTableImpl(newSize, nt);
316  if(!s)
317  {
318  return s;
319  }
320 
321  nt->setNormalizationFlag(NumericTable::nonNormalized);
322  return services::Status();
323  }
324 
/**
 *  Validates a numeric table supplied by the caller as a loading target.
 *  \param[in] nt  Table to check
 *  \return ErrorNullInputNumericTable status if nt is null, ErrorIncorrectTypeOfInputNumericTable status
 *          if the table uses the csrArray storage layout, an OK status otherwise
 */
325  virtual services::Status checkInputNumericTable(const NumericTable* const nt) const
326  {
327  if(!nt)
328  {
329  return services::Status(services::ErrorNullInputNumericTable);
330  }
331 
332  const NumericTable::StorageLayout layout = nt->getDataLayout();
333  if (layout == NumericTable::csrArray) // CSR tables are rejected as loading targets
334  {
335  return services::Status(services::ErrorIncorrectTypeOfInputNumericTable);
336  }
337 
338  return services::Status();
339  }
340 
/**
 *  Doubles the capacity of the raw line buffer, preserving its current contents.
 *  The doubled length is computed in unsigned arithmetic so it cannot trigger signed-overflow
 *  undefined behavior; if the result is not strictly larger than the current length (the current
 *  length is 0, or the doubled value does not fit in int on two's-complement targets), the request
 *  is rejected instead of passing a bogus size to the allocator.
 *  \return true if the buffer was enlarged; false if the new length is invalid or the allocation failed,
 *          in which case the existing buffer and its length are left unchanged
 */
341  bool enlargeBuffer()
342  {
343  const int newRawLineBufferLen = (int)((unsigned int)_rawLineBufferLen * 2u); // unsigned multiply: no signed-overflow UB
344  char* newRawLineBuffer = (newRawLineBufferLen > _rawLineBufferLen) ? (char *)daal::services::daal_malloc( newRawLineBufferLen ) : 0; // allocate only when the length really grew
345  if(newRawLineBuffer == 0)
346  return false;
347  daal::services::daal_memcpy_s(newRawLineBuffer, newRawLineBufferLen, _rawLineBuffer, _rawLineBufferLen);
348  daal::services::daal_free( _rawLineBuffer );
349  _rawLineBuffer = newRawLineBuffer;
350  _rawLineBufferLen = newRawLineBufferLen;
351  return true;
352  }
353 
354 protected:
355  char *_rawLineBuffer; // buffer allocated with daal_malloc; passed to the feature manager as the raw line text
356  int _rawLineBufferLen; // allocated size of _rawLineBuffer in bytes (1024 initially, doubled by enlargeBuffer)
357  int _rawLineLength; // length passed to the feature manager with _rawLineBuffer; 0 ends a loadDataBlock loop
358 
359  bool _contextDictFlag; // false after construction; set to true by createDictionaryFromContext()
360 };
362 } // namespace interface1
363 using interface1::CsvDataSource;
364 
365 }
366 }
367 #endif
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock(size_t maxRows, size_t rowOffset, size_t fullRows, NumericTable *nt) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:217
daal::data_management::interface1::DataSource::checkDictionary
services::Status checkDictionary()
Definition: data_source.h:385
daal::services::ErrorDictionaryAlreadyAvailable
Definition: error_indexes.h:175
daal::services::ErrorNullInputNumericTable
Definition: error_indexes.h:107
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:95
daal::data_management::interface1::CsvDataSource::setDictionary
services::Status setDictionary(DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:137
daal
Definition: algorithm_base_common.h:57
daal::services::ErrorNumericTableNotAllocated
Definition: error_indexes.h:179
daal::data_management::interface1::CsvDataSource::createDictionaryFromContext
services::Status createDictionaryFromContext() DAAL_C11_OVERRIDE
Definition: csv_data_source.h:275
daal::data_management::interface1::DataCollection
Class that provides functionality of Collection container for objects derived from SerializationIface...
Definition: data_collection.h:71
daal::data_management::interface1::NumericTable::getDictionarySharedPtr
virtual NumericTableDictionaryPtr getDictionarySharedPtr() const DAAL_C11_OVERRIDE
Definition: numeric_table.h:658
daal::data_management::interface1::CsvDataSource::getNumberOfAvailableRows
size_t getNumberOfAvailableRows() DAAL_C11_OVERRIDE
Definition: csv_data_source.h:296
daal::data_management::interface1::DataSourceIface::DictionaryCreationFlag
DictionaryCreationFlag
Specifies whether a Data Dictionary is created from the context of a Data Source. ...
Definition: data_source.h:95
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock(NumericTable *nt) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:145
daal::data_management::interface1::DataSourceIface::notAllocateNumericTable
Definition: data_source.h:107
daal::data_management::interface1::DataSourceIface::freeNumericTable
virtual void freeNumericTable()=0
daal::data_management::interface1::DataCollection::push_back
DataCollection & push_back(const SerializationIfacePtr &x)
daal::data_management::interface1::DataSourceIface::doAllocateNumericTable
Definition: data_source.h:108
daal::data_management::interface1::DataSourceTemplate
Implements the abstract DataSourceIface interface.
Definition: data_source.h:489
daal::services::daal_memcpy_s
DAAL_EXPORT void daal_memcpy_s(void *dest, size_t numberOfElements, const void *src, size_t count)
daal::data_management::interface1::DataSource::checkNumericTable
services::Status checkNumericTable()
Definition: data_source.h:371
daal::data_management::interface1::NumericTable::setNormalizationFlag
NormalizationType setNormalizationFlag(NormalizationType flag)
Definition: numeric_table.h:762
daal::data_management::interface1::CsvDataSource
Specifies methods to access data stored in files.
Definition: csv_data_source.h:74
daal::data_management::interface1::NumericTableIface::nonNormalized
Definition: numeric_table.h:343
daal::services::daal_malloc
DAAL_EXPORT void * daal_malloc(size_t size, size_t alignment=DAAL_MALLOC_DEFAULT_ALIGNMENT)
daal::data_management::interface1::HomogenNumericTable::create
static services::SharedPtr< HomogenNumericTable< DataType > > create(NumericTableDictionaryPtr ddictForHomogenNumericTable, services::Status *stat=NULL)
Definition: homogen_numeric_table.h:117
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:600
daal::data_management::interface1::NumericTableIface::StorageLayout
StorageLayout
Storage layouts that may need to be supported.
Definition: numeric_table.h:352
daal::data_management::interface1::DataSourceTemplate::freeNumericTable
void freeNumericTable() DAAL_C11_OVERRIDE
Definition: data_source.h:535
daal::data_management::interface1::DataSource::loadDataBlock
size_t loadDataBlock() DAAL_C11_OVERRIDE
Definition: data_source.h:298
daal::data_management::interface1::CsvDataSource::getFeatureManager
FeatureManager & getFeatureManager()
Definition: csv_data_source.h:126
daal::data_management::interface1::CsvDataSource::CsvDataSource
CsvDataSource(DataSourceIface::NumericTableAllocationFlag doAllocateNumericTable=DataSource::notAllocateNumericTable, DataSourceIface::DictionaryCreationFlag doCreateDictionaryFromContext=DataSource::notDictionaryFromContext, size_t initialMaxRows=10)
Definition: csv_data_source.h:102
daal::services::ErrorIncorrectTypeOfInputNumericTable
Definition: error_indexes.h:117
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock() DAAL_C11_OVERRIDE
Definition: csv_data_source.h:259
daal::services::daal_free
DAAL_EXPORT void daal_free(void *ptr)
daal::services::ErrorIncorrectDataRange
Definition: error_indexes.h:103
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock(size_t maxRows, size_t rowOffset, size_t fullRows) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:269
daal::data_management::interface1::DataCollection::size
size_t size() const
daal::data_management::interface1::NumericTableIface::doAllocate
Definition: numeric_table.h:313
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock(size_t maxRows) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:264
daal::data_management::interface1::NumericTable::getNumberOfRows
size_t getNumberOfRows() const
Definition: numeric_table.h:686
daal::data_management::interface1::DataSourceIface::notDictionaryFromContext
Definition: data_source.h:97
daal::data_management::interface1::BlockDescriptor< DAAL_DATA_TYPE >
daal::data_management::interface1::CsvDataSource::loadDataBlock
size_t loadDataBlock(size_t maxRows, NumericTable *nt) DAAL_C11_OVERRIDE
Definition: csv_data_source.h:210
daal::data_management::interface1::DenseNumericTableIface::releaseBlockOfRows
virtual services::Status releaseBlockOfRows(BlockDescriptor< double > &block)=0
daal::data_management::interface1::DataSourceIface::NumericTableAllocationFlag
NumericTableAllocationFlag
Specifies whether a Numeric Table is allocated inside of the Data Source object.
Definition: data_source.h:105
daal::data_management::interface1::Dictionary::create
static services::SharedPtr< Dictionary > create(size_t nfeat, FeaturesEqual featuresEqual=notEqual, services::Status *stat=NULL)
Definition: data_dictionary.h:209
daal::data_management::interface1::CsvDataSource::FeatureManager
_featureManager FeatureManager
Definition: csv_data_source.h:86
daal::data_management::interface1::DataSource::setDictionary
services::Status setDictionary(DataSourceDictionary *dict) DAAL_C11_OVERRIDE
Definition: data_source.h:243
daal::data_management::interface1::CsvDataSource::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() DAAL_C11_OVERRIDE
Definition: csv_data_source.h:132
daal::data_management::interface1::DenseNumericTableIface::getBlockOfRows
virtual services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor< double > &block)=0
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionar...
Definition: data_dictionary.h:184

For more complete information about compiler optimizations, see our Optimization Notice.