Python* API Reference for Intel® Data Analytics Acceleration Library 2018 Update 3

kmeans_dense_distr.py

1 # file: kmeans_dense_distr.py
2 #===============================================================================
3 # Copyright 2014-2018 Intel Corporation.
4 #
5 # This software and the related documents are Intel copyrighted materials, and
6 # your use of them is governed by the express license under which they were
7 # provided to you (License). Unless the License provides otherwise, you may not
8 # use, modify, copy, publish, distribute, disclose or transmit this software or
9 # the related documents without Intel's prior written permission.
10 #
11 # This software and the related documents are provided as is, with no express
12 # or implied warranties, other than those that are expressly stated in the
13 # License.
14 #===============================================================================
15 
16 
17 
18 
19 import os
20 import sys
21 
22 import daal.algorithms.kmeans as kmeans
23 import daal.algorithms.kmeans.init as init
24 from daal import step1Local, step2Master
25 from daal.data_management import FileDataSource, DataSourceIface
26 
# Make the parent of this script's directory importable, so that the
# `utils` helper module imported right below can be found regardless of the
# current working directory.
utils_folder = os.path.realpath(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
if utils_folder not in sys.path:
    sys.path.insert(0, utils_folder)
30 from utils import printNumericTable
31 
# Directory holding the example data sets, relative to the working directory.
DAAL_PREFIX = os.path.join('..', 'data')

# One input .csv file per data block; the main section reads dataFileNames[i]
# for block i.
dataFileNames = [
    os.path.join(DAAL_PREFIX, 'distributed', 'kmeans_dense_1.csv'),
    os.path.join(DAAL_PREFIX, 'distributed', 'kmeans_dense_2.csv'),
    os.path.join(DAAL_PREFIX, 'distributed', 'kmeans_dense_3.csv'),
    os.path.join(DAAL_PREFIX, 'distributed', 'kmeans_dense_4.csv')
]

# K-Means algorithm parameters
nClusters = 20          # number of clusters (centroids) to compute
nIterations = 5         # number of distributed iterations to run
nBlocks = 4             # number of data blocks; must match len(dataFileNames)
nVectorsInBlock = 2500  # number of observations assumed to be in each block

# Placeholder list with one slot per block; each slot is overwritten with the
# numeric table loaded from the corresponding input file.
dataTable = [0] * nBlocks
47 
if __name__ == "__main__":
    # Driver for the distributed K-Means example. It runs in three phases:
    #   1. load every data block and compute the initial centroids,
    #   2. run nIterations rounds of local (step 1) / master (step 2) updates,
    #   3. compute the cluster assignments of every block for the final
    #      centroids and print the results.

    # Master-side K-Means object; created once and reused by every iteration
    masterAlgorithm = kmeans.Distributed(step2Master, nClusters, method=kmeans.lloydDense)

    centroids = None
    assignments = [0] * nBlocks

    # ---- Phase 1: load the data and initialize the centroids ----
    masterInitAlgorithm = init.Distributed(step2Master, nClusters, method=init.randomDense)
    for i in range(nBlocks):
        # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
        dataSource = FileDataSource(
            dataFileNames[i], DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext
        )

        # Retrieve the data from the input file
        dataSource.loadDataBlock()

        dataTable[i] = dataSource.getNumericTable()

        # Create a local (step 1) object for the centroid initialization.
        # It is given the total number of vectors over all blocks and the
        # offset of this block's first vector within that total.
        localInit = init.Distributed(step1Local, nClusters, nBlocks * nVectorsInBlock, i * nVectorsInBlock, method=init.randomDense)

        localInit.input.set(init.data, dataTable[i])
        res = localInit.compute()

        # Hand the local partial result over to the master
        masterInitAlgorithm.input.add(init.partialResults, res)

    # Merge the partial results on the master and get the initial centroids
    masterInitAlgorithm.compute()
    res = masterInitAlgorithm.finalizeCompute()
    centroids = res.get(init.centroids)

    # ---- Phase 2: distributed K-Means iterations ----
    for it in range(nIterations):
        for i in range(nBlocks):
            # Create a local (step 1) algorithm object for the K-Means algorithm.
            # The third argument is passed as an explicit False: the original
            # expression `it == nIterations` can never be true inside
            # range(nIterations), and the assignments are computed separately
            # in phase 3 below.
            localAlgorithm = kmeans.Distributed(step1Local, nClusters, False, method=kmeans.lloydDense)

            # Set the input data to the algorithm
            localAlgorithm.input.set(kmeans.data, dataTable[i])
            localAlgorithm.input.set(kmeans.inputCentroids, centroids)

            pres = localAlgorithm.compute()

            # Hand the local partial result over to the master
            masterAlgorithm.input.add(kmeans.partialResults, pres)

        # Merge the partial results of this iteration on the master
        masterAlgorithm.compute()
        result = masterAlgorithm.finalizeCompute()

        # The updated centroids are the input of the next iteration
        centroids = result.get(kmeans.centroids)
        goalFunction = result.get(kmeans.goalFunction)

    # ---- Phase 3: cluster assignments for the final centroids ----
    for i in range(nBlocks):
        # Create a batch K-Means object that is passed 0 as its second
        # argument; it is used here only to obtain the assignments of
        # block i with respect to the final centroids.
        localAlgorithm = kmeans.Batch(nClusters, 0, method=kmeans.lloydDense)

        # Set the input data to the algorithm
        localAlgorithm.input.set(kmeans.data, dataTable[i])
        localAlgorithm.input.set(kmeans.inputCentroids, centroids)

        res = localAlgorithm.compute()

        assignments[i] = res.get(kmeans.assignments)

    # Print the clusterization results
    printNumericTable(assignments[0], "First 10 cluster assignments from 1st node:", 10)
    printNumericTable(centroids, "First 10 dimensions of centroids:", 20, 10)
    printNumericTable(goalFunction, "Goal function value:")

For more complete information about compiler optimizations, see our Optimization Notice.