Python* API Reference for Intel® Data Analytics Acceleration Library 2019 Update 5

simple_csv_feature_modifiers.py

1 # file: simple_csv_feature_modifiers.py
2 #===============================================================================
3 # Copyright 2014-2019 Intel Corporation.
4 #
5 # This software and the related documents are Intel copyrighted materials, and
6 # your use of them is governed by the express license under which they were
7 # provided to you (License). Unless the License provides otherwise, you may not
8 # use, modify, copy, publish, distribute, disclose or transmit this software or
9 # the related documents without Intel's prior written permission.
10 #
11 # This software and the related documents are provided as is, with no express
12 # or implied warranties, other than those that are expressly stated in the
13 # License.
14 #===============================================================================
15 
16 # ! Content:
17 # ! Python example of modifiers usage with file data source
18 # !*****************************************************************************
19 
20 #
21 
22 
23 #
24 
25 from daal.data_management import FileDataSource, CsvDataSourceOptions, modifiers, features
26 from daal.data_management.modifiers import csv
27 
28 import os, sys
29 utils_folder = os.path.realpath(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
30 if utils_folder not in sys.path:
31  sys.path.insert(0, utils_folder)
32 from utils import printNumericTable
33 
34 # Relative path to the CSV file read by every example function below
35 csvFileName = "../data/batch/mixed_text_and_numbers.csv"
36 
37 # CSV data source options shared by all examples: the three flags are combined with bitwise OR
38 csvOptions = CsvDataSourceOptions(CsvDataSourceOptions.allocateNumericTable |\
39  CsvDataSourceOptions.createDictionaryFromContext |\
40  CsvDataSourceOptions.parseHeader)
41 
42 # Read CSV using default data source behavior
43 def readDefault():
44  ds = FileDataSource(csvFileName, csvOptions)
45  # No modifier is registered here: by default all numeric columns are parsed
46  # as continuous features and all other columns as categorical ones
47  ds.loadDataBlock()
48  printNumericTable(ds.getNumericTable(), "readDefault function result:")
49 
50 
51 # Read CSV and do basic filtering using columns indices
52 def readOnlySpecifiedColumnIndices():
53  ds = FileDataSource(csvFileName, csvOptions)
54  # This means that columns with indices 0, 1, 5 will be included to the output numeric
55  # table and other columns will be ignored. The first argument of method 'include' specifies
56  # the set of columns and the second one specifies modifier. in this case we use predefined
57  # automatic modifier that automatically decides how to parse column in the best way
58  print(modifiers.csv.automatic())
59  ds.getFeatureManager().addModifier([0,1,5], modifiers.csv.automatic())
60  ds.loadDataBlock()
61  printNumericTable(ds.getNumericTable(), "readOnlySpecifiedColumnIndices function result:")
62 
63 
64 # Read CSV and do basic filtering using column names
65 def readOnlySpecifiedColumnNames():
66  ds = FileDataSource(csvFileName, csvOptions)
67  # Same as readOnlySpecifiedColumnIndices, but the columns are selected by name instead of by index
68  ds.getFeatureManager().addModifier(["Numeric1", "Categorical0"], modifiers.csv.automatic())
69  ds.loadDataBlock()
70  printNumericTable(ds.getNumericTable(), "readOnlySpecifiedColumnNames function result:")
71 
72 
73 # Read CSV using multiple modifiers
74 def readUsingMultipleModifiers():
75  ds = FileDataSource(csvFileName, csvOptions)
76 
77  fm = ds.getFeatureManager()
78  fm.addModifier(["Numeric1"], modifiers.csv.continuous())
79  # A single column list may mix positions (6) and names ("Categorical1")
80  fm.addModifier([6, "Categorical1"], modifiers.csv.categorical())
81 
82  ds.loadDataBlock()
83  printNumericTable(ds.getNumericTable(), "readUsingMultipleModifiers function result:")
84 
85 
86 if __name__ == "__main__":
87  # Read CSV using default data source behavior
88  readDefault()
89 
90  # Read CSV and keep only the columns selected by index
91  readOnlySpecifiedColumnIndices()
92 
93  # Read CSV and keep only the columns selected by name
94  readOnlySpecifiedColumnNames()
95 
96  # Read CSV using multiple modifiers
97  readUsingMultipleModifiers()

For more complete information about compiler optimizations, see our Optimization Notice.