Python* API Reference for Intel® Data Analytics Acceleration Library 2019

df_cls_traverse_model.py

1 # file: df_cls_traverse_model.py
2 #===============================================================================
3 # Copyright 2014-2018 Intel Corporation.
4 #
5 # This software and the related documents are Intel copyrighted materials, and
6 # your use of them is governed by the express license under which they were
7 # provided to you (License). Unless the License provides otherwise, you may not
8 # use, modify, copy, publish, distribute, disclose or transmit this software or
9 # the related documents without Intel's prior written permission.
10 #
11 # This software and the related documents are provided as is, with no express
12 # or implied warranties, other than those that are expressly stated in the
13 # License.
14 #===============================================================================
15 
16 #
17 # ! Content:
18 # ! Python example of decision forest classification model traversal.
19 # !
20 # ! The program trains the decision forest classification model on a training
21 # ! data set and prints the trained model using depth-first traversal.
22 # !*****************************************************************************
23 
24 #
25 
26 
27 #
28 from __future__ import print_function
29 
30 from daal.algorithms import classifier
31 from daal.algorithms import decision_forest
32 import daal.algorithms.decision_forest.classification
33 import daal.algorithms.decision_forest.classification.training
34 
35 from daal.data_management import (
36  FileDataSource, HomogenNumericTable, MergedNumericTable, NumericTableIface, DataSourceIface, features
37 )
38 
39 # Input data set parameters (read by loadData() and trainModel())
40 trainDatasetFileName = "../data/batch/df_classification_train.csv" # Relative path of the CSV file passed to loadData()
41 categoricalFeaturesIndices = [2] # Feature indices whose dictionary entries loadData() marks as DAAL_CATEGORICAL
42 nFeatures = 3 # Number of feature columns in the data table; trainModel() also assigns it to featuresPerNode
43 
44 # Decision forest parameters (assigned to algorithm.parameter in trainModel())
45 nTrees = 2 # Assigned to parameter.nTrees
46 minObservationsInLeafNode = 8 # Assigned to parameter.minObservationsInLeafNode
47 maxTreeDepth = 15 # Assigned to parameter.maxTreeDepth
48 
49 nClasses = 5 # Number of classes; passed to the training Batch constructor
50 
51 
def trainModel():
    """Train a decision forest classification model on the training CSV.

    Loads the feature and label tables with ``loadData(trainDatasetFileName)``,
    configures a batch training algorithm from the module-level settings
    (``nClasses``, ``nTrees``, ``nFeatures``, ``minObservationsInLeafNode``,
    ``maxTreeDepth``) and runs it.

    Returns:
        Whatever the algorithm's ``compute()`` call returns; the caller in
        this file reads the trained model from it via
        ``.get(classifier.training.model)``.
    """
    featureTable, labelTable = loadData(trainDatasetFileName)

    # Batch trainer for an nClasses-way classification problem
    trainer = decision_forest.classification.training.Batch(nClasses)

    # Forest settings, all taken from the module-level constants
    trainer.parameter.nTrees = nTrees
    trainer.parameter.featuresPerNode = nFeatures
    trainer.parameter.minObservationsInLeafNode = minObservationsInLeafNode
    trainer.parameter.maxTreeDepth = maxTreeDepth

    # Hand the training features and their labels to the algorithm
    trainer.input.set(classifier.training.data, featureTable)
    trainer.input.set(classifier.training.labels, labelTable)

    # Build the model and pass the training result straight back
    trainingResult = trainer.compute()
    return trainingResult
71 
72 
def loadData(fileName):
    """Load a CSV file into a feature table and a dependent-variable table.

    Args:
        fileName: Path of the .csv file to read.

    Returns:
        A ``(data, dependentVar)`` pair of HomogenNumericTable objects: the
        first is created with ``nFeatures`` columns, the second with one
        column. Both are filled through a MergedNumericTable that wraps them.
        The dictionary entries of the feature table listed in
        ``categoricalFeaturesIndices`` are marked as ``DAAL_CATEGORICAL``.
    """
    # Both tables are created with 0 rows and the notAllocate flag
    # (presumably storage is provided when the data block is loaded -- TODO confirm)
    featureTable = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
    labelTable = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
    combinedTable = MergedNumericTable(featureTable, labelTable)

    # CSV-backed data source for the requested file
    csvSource = FileDataSource(
        fileName,
        DataSourceIface.notAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )

    # Read the file contents into the merged view of the two tables
    csvSource.loadDataBlock(combinedTable)

    # Flag the configured feature columns as categorical
    featureDictionary = featureTable.getDictionary()
    for columnIndex in categoricalFeaturesIndices:
        featureDictionary[columnIndex].featureType = features.DAAL_CATEGORICAL

    return featureTable, labelTable
93 
94 
class PrintNodeVisitor(classifier.TreeNodeVisitor):
    """Tree node visitor that prints every node it is called back with.

    An instance is passed to the model's ``traverseDF`` method, which calls
    ``onLeafNode`` / ``onSplitNode`` for the nodes of a tree. Each node is
    printed on its own line, preceded by one space per tree level.
    """

    def __init__(self):
        super(PrintNodeVisitor, self).__init__()

    def onLeafNode(self, level, response):
        """Print a leaf node with its level and response value.

        Always returns True (presumably signalling the traversal to
        continue -- confirm against the TreeNodeVisitor documentation).
        """
        # One space of indentation per level, emitted without a newline
        print(" " * level, end='')
        print("Level {}, leaf node. Response value = {}".format(level, response))
        return True

    def onSplitNode(self, level, featureIndex, featureValue):
        """Print a split node with its level, feature index and feature value.

        The feature value is formatted with ``{:.6g}``. Always returns True
        (presumably signalling the traversal to continue -- confirm against
        the TreeNodeVisitor documentation).
        """
        # One space of indentation per level, emitted without a newline
        print(" " * level, end='')
        print("Level {}, split node. Feature index = {}, feature value = {:.6g}".format(level, featureIndex, featureValue))
        return True
114 
115 
def printModel(m):
    """Print every tree of model ``m`` using depth-first traversal.

    Args:
        m: Model object providing ``numberOfTrees()`` and
           ``traverseDF(treeIndex, visitor)``.

    Prints the number of trees, then for each tree a ``Tree #i`` header
    followed by the node lines produced by a PrintNodeVisitor.
    """
    nodePrinter = PrintNodeVisitor()
    treeCount = m.numberOfTrees()
    print("Number of trees: {}".format(treeCount))
    for treeIndex in range(treeCount):
        print("Tree #{}".format(treeIndex))
        # The same visitor instance is reused for every tree
        m.traverseDF(treeIndex, nodePrinter)
122 
123 
if __name__ == "__main__":

    # Train the forest, pull the model out of the training result, and print it
    trainedModel = trainModel().get(classifier.training.model)
    printModel(trainedModel)

For more complete information about compiler optimizations, see our Optimization Notice.