Python* API Reference for Intel® Data Analytics Acceleration Library 2019 Update 4

kmeans_init_dense_batch.py

1 # file: kmeans_init_dense_batch.py
2 #===============================================================================
3 # Copyright 2014-2019 Intel Corporation.
4 #
5 # This software and the related documents are Intel copyrighted materials, and
6 # your use of them is governed by the express license under which they were
7 # provided to you (License). Unless the License provides otherwise, you may not
8 # use, modify, copy, publish, distribute, disclose or transmit this software or
9 # the related documents without Intel's prior written permission.
10 #
11 # This software and the related documents are provided as is, with no express
12 # or implied warranties, other than those that are expressly stated in the
13 # License.
14 #===============================================================================
15 
16 #
17 # ! Content:
18 # ! Python example of dense K-Means clustering with different initialization methods
19 # ! in the batch processing mode
20 # !*****************************************************************************
21 
22 #
23 
24 
25 #
26 
27 import os
28 import numpy as np
29 from daal.algorithms import kmeans
30 import daal.algorithms.kmeans.init
31 from daal.data_management import HomogenNumericTable, FileDataSource, DataSource, BlockDescriptor, readOnly
32 
# Directory holding the example data sets, relative to the working directory.
DAAL_PREFIX = os.path.join('..', 'data')
# Input data set: CSV file read by the script's entry point.
datasetFileName = os.path.join(DAAL_PREFIX, 'batch', 'kmeans_init_dense.csv')

# K-Means algorithm parameters
nMaxIterations = 1000       # iteration cap handed to kmeans.Batch
cAccuracyThreshold = 0.01   # assigned to algorithm.parameter.accuracyThreshold
nClusters = 20              # number of clusters requested in every run
41 
def getSingleValue(pTbl, ntype):
    """Return the first value of the first row block read from a table.

    Args:
        pTbl: Table object exposing ``getBlockOfRows`` and
            ``releaseBlockOfRows`` (in this script, the result tables
            returned by the K-Means algorithm).
        ntype: NumPy element type used to construct the ``BlockDescriptor``
            through which the value is read.

    Returns:
        Element 0 of the flattened array exposed by the block.
    """
    block = BlockDescriptor(ntype=ntype)
    # Arguments are presumably (start row, row count, access mode, block),
    # i.e. a read-only view of the first row -- confirm against the DAAL API.
    pTbl.getBlockOfRows(0, 1, readOnly, block)
    try:
        # flatten() yields a copy, so the scalar stays valid after release.
        value = block.getArray().flatten()[0]
    finally:
        # Always hand the block back, even when reading the array fails
        # (e.g. an empty table makes the [0] lookup raise IndexError).
        pTbl.releaseBlockOfRows(block)
    return value
48 
49 
def runKmeans(inputData, nClusters, method, methodName, oversamplingFactor=-1.0):
    """Initialize centroids with ``method``, run K-Means, and print a summary.

    Args:
        inputData: Numeric table holding the observations to cluster.
        nClusters: Number of clusters, passed to both the initialization
            algorithm and the clustering algorithm.
        method: ``kmeans.init`` method identifier selecting how the initial
            centroids are computed.
        methodName: Human-readable label for ``method``; used only in the
            printed output.
        oversamplingFactor: When greater than 0, overrides the initialization
            algorithm's ``oversamplingFactor`` parameter; otherwise that
            parameter is left at whatever value the library gives it.

    Returns:
        None. Parameters and results are written to standard output.

    The iteration cap and accuracy threshold come from the module-level
    ``nMaxIterations`` and ``cAccuracyThreshold``.
    """
    # Step 1: compute the initial centroids with the requested method.
    initAlgorithm = kmeans.init.Batch(nClusters, fptype=np.float32, method=method)
    initAlgorithm.input.set(kmeans.init.data, inputData)
    if oversamplingFactor > 0:
        initAlgorithm.parameter.oversamplingFactor = oversamplingFactor

    initMessage = "K-means init parameters: method = " + methodName
    if method == kmeans.init.parallelPlusDense:
        # Only this method reports the two extra initialization parameters.
        initMessage += (", oversamplingFactor = "
                        + str(initAlgorithm.parameter.oversamplingFactor)
                        + ", nRounds = " + str(initAlgorithm.parameter.nRounds))
    print(initMessage)

    initialCentroids = initAlgorithm.compute().get(kmeans.init.centroids)

    # Step 2: run K-Means itself, seeded with those centroids.
    clustering = kmeans.Batch(nClusters, nMaxIterations)
    clustering.input.set(kmeans.data, inputData)
    clustering.input.set(kmeans.inputCentroids, initialCentroids)
    clustering.parameter.accuracyThreshold = cAccuracyThreshold
    print("K-means algorithm parameters: maxIterations = "
          + str(clustering.parameter.maxIterations)
          + ", accuracyThreshold = "
          + str(clustering.parameter.accuracyThreshold))
    result = clustering.compute()

    # Step 3: report the objective function value (scaled by 1e-6 for
    # display) and the number of iterations performed.
    goalFunc = getSingleValue(result.get(kmeans.objectiveFunction), ntype=np.float32)
    nIterations = getSingleValue(result.get(kmeans.nIterations), ntype=np.intc)
    print("K-means algorithm results: Objective function value = "
          + str(goalFunc*1e-6)
          + "*1E+6, number of iterations = " + str(nIterations) + "\n")
79 
80 
if __name__ == "__main__":
    # Table that receives the CSV contents as float32 values.
    inputData = HomogenNumericTable(ntype=np.float32)

    # Initialize FileDataSource to retrieve the input data from a .csv file.
    # NOTE(review): the two flags presumably mean "do not allocate an internal
    # table" (one is supplied below) and "build the data dictionary from the
    # file contents" -- confirm against the DAAL data-source documentation.
    dataSource = FileDataSource(
        datasetFileName,
        DataSource.notAllocateNumericTable,
        DataSource.doDictionaryFromContext,
    )

    # Retrieve the data from the input file into the caller-provided table.
    dataSource.loadDataBlock(inputData)

    # One K-Means run per initialization method. The parallel-plus method is
    # run twice, with oversampling factors 0.5 and 2.0; the other methods
    # rely on runKmeans' default for that argument.
    runs = (
        (kmeans.init.deterministicDense, "deterministicDense"),
        (kmeans.init.randomDense, "randomDense"),
        (kmeans.init.plusPlusDense, "plusPlusDense"),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", 0.5),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", 2.0),
    )
    for run in runs:
        runKmeans(inputData, nClusters, *run)

For more complete information about compiler optimizations, see our Optimization Notice.