Python* API Reference for Intel® Data Analytics Acceleration Library 2018 Update 2

kmeans_init_dense_batch.py

1 # file: kmeans_init_dense_batch.py
2 #===============================================================================
3 # Copyright 2014-2018 Intel Corporation
4 # All Rights Reserved.
5 #
6 # If this software was obtained under the Intel Simplified Software License,
7 # the following terms apply:
8 #
9 # The source code, information and material ("Material") contained herein is
10 # owned by Intel Corporation or its suppliers or licensors, and title to such
11 # Material remains with Intel Corporation or its suppliers or licensors. The
12 # Material contains proprietary information of Intel or its suppliers and
13 # licensors. The Material is protected by worldwide copyright laws and treaty
14 # provisions. No part of the Material may be used, copied, reproduced,
15 # modified, published, uploaded, posted, transmitted, distributed or disclosed
16 # in any way without Intel's prior express written permission. No license under
17 # any patent, copyright or other intellectual property rights in the Material
18 # is granted to or conferred upon you, either expressly, by implication,
19 # inducement, estoppel or otherwise. Any license under such intellectual
20 # property rights must be express and approved by Intel in writing.
21 #
22 # Unless otherwise agreed by Intel in writing, you may not remove or alter this
23 # notice or any other notice embedded in Materials by Intel or Intel's
24 # suppliers or licensors in any way.
25 #
26 #
27 # If this software was obtained under the Apache License, Version 2.0 (the
28 # "License"), the following terms apply:
29 #
30 # You may not use this file except in compliance with the License. You may
31 # obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
32 #
33 #
34 # Unless required by applicable law or agreed to in writing, software
35 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
36 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 #
38 # See the License for the specific language governing permissions and
39 # limitations under the License.
40 #===============================================================================
41 
42 #
43 # ! Content:
44 # ! Python example of dense K-Means clustering with different initialization methods
45 # ! in the batch processing mode
46 # !*****************************************************************************
47 
48 #
49 
50 
51 #
52 
53 import os
54 import numpy as np
55 from daal.algorithms import kmeans
56 import daal.algorithms.kmeans.init
57 from daal.data_management import HomogenNumericTable, FileDataSource, DataSource, BlockDescriptor, readOnly
58 
# Root directory of the example data sets (relative path)
DAAL_PREFIX = os.path.join('..', 'data')

# Input data set: CSV file read by the batch example
datasetFileName = os.path.join(DAAL_PREFIX, 'batch', 'kmeans_init_dense.csv')

# K-Means algorithm parameters
nClusters = 20               # number of clusters requested from the algorithm
nMaxIterations = 1000        # iteration cap passed to kmeans.Batch
cAccuracyThreshold = 0.01    # assigned to the algorithm's accuracyThreshold parameter
67 
def getSingleValue(pTbl, ntype):
    """Return the first element of the first block of rows of a table.

    Args:
        pTbl: Table object exposing ``getBlockOfRows`` and
            ``releaseBlockOfRows``.
        ntype: Element type forwarded to ``BlockDescriptor``.

    Returns:
        Element 0 of the flattened array obtained from the block.

    Raises:
        IndexError: If the flattened block contains no elements.
    """
    block = BlockDescriptor(ntype=ntype)
    # Presumably (start row 0, one row) -- confirm against the DAAL API docs.
    pTbl.getBlockOfRows(0, 1, readOnly, block)
    try:
        # flatten() returns a copy, so the element stays valid after release.
        return block.getArray().flatten()[0]
    finally:
        # Always hand the block back, even if reading the value failed.
        pTbl.releaseBlockOfRows(block)
74 
75 
def runKmeans(inputData, nClusters, method, methodName, oversamplingFactor = -1.0):
    """Initialize centroids with ``method``, run K-Means, and print a report.

    Args:
        inputData: Table with the observations; passed to both the
            initialization algorithm and the clustering algorithm.
        nClusters: Number of clusters passed to both algorithms.
        method: Initialization method identifier from ``kmeans.init``.
        methodName: Label for ``method`` used in the printed output.
        oversamplingFactor: Applied to the initialization parameters only
            when it is greater than zero.

    Returns:
        None. The parameters and results are written to standard output.
    """
    # Step 1: compute the initial centroids
    initAlgorithm = kmeans.init.Batch(nClusters, fptype=np.float32, method=method)
    initAlgorithm.input.set(kmeans.init.data, inputData)
    if oversamplingFactor > 0:
        initAlgorithm.parameter.oversamplingFactor = oversamplingFactor

    # The extra parameters are reported only for the parallelPlusDense method
    initMessage = "K-means init parameters: method = " + methodName
    if method == kmeans.init.parallelPlusDense:
        initMessage += ", oversamplingFactor = " + str(initAlgorithm.parameter.oversamplingFactor)
        initMessage += ", nRounds = " + str(initAlgorithm.parameter.nRounds)
    print(initMessage)

    initialCentroids = initAlgorithm.compute().get(kmeans.init.centroids)

    # Step 2: configure the K-Means algorithm itself
    clustering = kmeans.Batch(nClusters, nMaxIterations)
    clustering.input.set(kmeans.data, inputData)
    clustering.input.set(kmeans.inputCentroids, initialCentroids)
    clustering.parameter.accuracyThreshold = cAccuracyThreshold

    parameterMessage = "".join([
        "K-means algorithm parameters: maxIterations = ",
        str(clustering.parameter.maxIterations),
        ", accuracyThreshold = ",
        str(clustering.parameter.accuracyThreshold),
    ])
    print(parameterMessage)

    result = clustering.compute()

    # Step 3: report the objective function (scaled by 1e-6) and iteration count
    objective = getSingleValue(result.get(kmeans.objectiveFunction), ntype=np.float32)
    iterationCount = getSingleValue(result.get(kmeans.nIterations), ntype=np.intc)
    resultMessage = "".join([
        "K-means algorithm results: Objective function value = ",
        str(objective*1e-6),
        "*1E+6, number of iterations = ",
        str(iterationCount),
        "\n",
    ])
    print(resultMessage)
105 
106 
if __name__ == "__main__":
    # Table that receives the data, and the CSV-backed source that fills it
    inputData = HomogenNumericTable(ntype=np.float32)
    dataSource = FileDataSource(
        datasetFileName,
        DataSource.notAllocateNumericTable,
        DataSource.doDictionaryFromContext,
    )

    # Load the contents of the input file into the table
    dataSource.loadDataBlock(inputData)

    # (initialization method, printed label, extra positional arguments)
    initializationRuns = (
        (kmeans.init.deterministicDense, "deterministicDense", ()),
        (kmeans.init.randomDense, "randomDense", ()),
        (kmeans.init.plusPlusDense, "plusPlusDense", ()),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", (0.5,)),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", (2.0,)),
    )
    for initMethod, initLabel, extraArgs in initializationRuns:
        runKmeans(inputData, nClusters, initMethod, initLabel, *extraArgs)

For more complete information about compiler optimizations, see our Optimization Notice.