i have improved kmeans algorithm (kplusplus) builds on class kmeans. detk class inherited kplusplus.
the objective of kplusplus class find out optimal seeding finding kmeans centroids (source)
detk calculates gap statistic find optimal number of clusters. have found code here
# kmeans class class kmeans(): def __init__(self, k, x=none, n=0): self.k = k if x == none: if n == 0: raise exception("if no data provided, \ parameter n (number of points) needed") else: self.n = n self.x = self._init_board_gauss(n, k) else: self.x = x self.n = len(x) self.mu = none self.clusters = none self.method = none def _init_board_gauss(self, n, k): n = float(n)/k x = [] in range(k): c = (random.uniform(-1,1), random.uniform(-1,1)) s = random.uniform(0.05,0.15) x = [] while len(x) < n: a,b = np.array([np.random.normal(c[0],s),np.random.normal(c[1],s)]) # continue drawing points distribution in range [-1,1] if abs(a) , abs(b)<1: x.append([a,b]) x.extend(x) x = np.array(x)[:n] return x def plot_board(self): x = self.x fig = plt.figure(figsize=(5,5)) plt.xlim(-1,1) plt.ylim(-1,1) if self.mu , self.clusters: mu = self.mu clus = self.clusters k = self.k m, clu in clus.items(): cs = cm.spectral(1.*m/self.k) plt.plot(mu[m][0], mu[m][1], 'o', marker='*', \ markersize=12, color=cs) plt.plot(zip(*clus[m])[0], zip(*clus[m])[1], '.', \ markersize=8, color=cs, alpha=0.5) else: plt.plot(zip(*x)[0], zip(*x)[1], '.', alpha=0.5) if self.method == '++': tit = 'k-means++' else: tit = 'k-means random initialization' pars = 'n=%s, k=%s' % (str(self.n), str(self.k)) plt.title('\n'.join([pars, tit]), fontsize=16) plt.savefig('kpp_n%s_k%s.png' % (str(self.n), str(self.k)), \ bbox_inches='tight', dpi=200) def _cluster_points(self): mu = self.mu clusters = {} x in self.x: bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \ in enumerate(mu)], key=lambda t:t[1])[0] try: clusters[bestmukey].append(x) except keyerror: clusters[bestmukey] = [x] self.clusters = clusters def _reevaluate_centers(self): clusters = self.clusters newmu = [] keys = sorted(self.clusters.keys()) k in keys: newmu.append(np.mean(clusters[k], axis = 0)) self.mu = newmu def _has_converged(self): k = len(self.oldmu) return(set([tuple(a) in self.mu]) == \ set([tuple(a) in self.oldmu])\ , len(set([tuple(a) in self.mu])) == k) def find_centers(self,k, method='random'): self.method = method x = self.x k = self.k self.oldmu = random.sample(x, k) if method != '++': # initialize k random centers self.mu = random.sample(x, k) while not self._has_converged(): self.oldmu = self.mu # assign points in x clusters self._cluster_points() # reevaluate centers self._reevaluate_centers()
the kplusplus class inherits kmeans find optimal seeding
class kplusplus(kmeans): def _dist_from_centers(self): cent = self.mu x = self.x d2 = np.array([min([np.linalg.norm(x-c)**2 c in cent]) x in x]) self.d2 = d2 def _choose_next_center(self): self.probs = self.d2/self.d2.sum() self.cumprobs = self.probs.cumsum() r = random.random() ind = np.where(self.cumprobs >= r)[0][0] return(self.x[ind]) def init_centers(self,k): self.k = k self.mu = random.sample(self.x, 1) while len(self.mu) < self.k: self._dist_from_centers() self.mu.append(self._choose_next_center()) def plot_init_centers(self): x = self.x fig = plt.figure(figsize=(5,5)) plt.xlim(-1,1) plt.ylim(-1,1) plt.plot(zip(*x)[0], zip(*x)[1], '.', alpha=0.5) plt.plot(zip(*self.mu)[0], zip(*self.mu)[1], 'ro') plt.savefig('kpp_init_n%s_k%s.png' % (str(self.n),str(self.k)), \ bbox_inches='tight', dpi=200)
the class detk inherits kplusplus find optmal number of clusters based on gap statistic
class detk(kplusplus): def fk(self, thisk, skm1=0): x = self.x nd = len(x[0]) = lambda k, nd: 1 - 3/(4*nd) if k == 2 else a(k-1, nd) + (1-a(k-1, nd))/6 self.find_centers(thisk, method='++') mu, clusters = self.mu, self.clusters sk = sum([np.linalg.norm(mu[i]-c)**2 \ in range(thisk) c in clusters[i]]) if thisk == 1: fs = 1 elif skm1 == 0: fs = 1 else: fs = sk/(a(thisk,nd)*skm1) return fs, sk def _bounding_box(self): x = self.x xmin, xmax = min(x,key=lambda a:a[0])[0], max(x,key=lambda a:a[0])[0] ymin, ymax = min(x,key=lambda a:a[1])[1], max(x,key=lambda a:a[1])[1] return (xmin,xmax), (ymin,ymax) def gap(self, thisk): x = self.x (xmin,xmax), (ymin,ymax) = self._bounding_box() self.init_centers(thisk) self.find_centers(thisk, method='++') mu, clusters = self.mu, self.clusters wk = np.log(sum([np.linalg.norm(mu[i]-c)**2/(2*len(c)) \ in range(thisk) c in clusters[i]])) # create b reference datasets b = 10 bwkbs = zeros(b) in range(b): xb = [] n in range(len(x)): xb.append([random.uniform(xmin,xmax), \ random.uniform(ymin,ymax)]) xb = np.array(xb) kb = detk(thisk, x=xb) kb.init_centers(thisk) kb.find_centers(thisk, method='++') ms, cs = kb.mu, kb.clusters bwkbs[i] = np.log(sum([np.linalg.norm(ms[j]-c)**2/(2*len(c)) \ j in range(thisk) c in cs[j]])) wkb = sum(bwkbs)/b sk = np.sqrt(sum((bwkbs-wkb)**2)/float(b))*np.sqrt(1+1/b) return wk, wkb, sk def run(self, maxk, which='both'): ks = range(1,maxk) fs = zeros(len(ks)) wks,wkbs,sks = zeros(len(ks)+1),zeros(len(ks)+1),zeros(len(ks)+1) # special case k=1 self.init_centers(1) if == 'f': fs[0], sk = self.fk(1) elif == 'gap': wks[0], wkbs[0], sks[0] = self.gap(1) else: fs[0], sk = self.fk(1) wks[0], wkbs[0], sks[0] = self.gap(1) # rest of ks k in ks[1:]: self.init_centers(k) if == 'f': fs[k-1], sk = self.fk(k, skm1=sk) elif == 'gap': wks[k-1], wkbs[k-1], sks[k-1] = self.gap(k) else: fs[k-1], sk = self.fk(k, skm1=sk) wks[k-1], wkbs[k-1], sks[k-1] = self.gap(k) if == 'f': self.fs = fs elif == 'gap': g = [] in range(len(ks)): g.append((wkbs-wks)[i] - ((wkbs-wks)[i+1]-sks[i+1])) self.g = np.array(g) else: self.fs = fs g = [] in range(len(ks)): g.append((wkbs-wks)[i] - ((wkbs-wks)[i+1]-sks[i+1])) self.g = np.array(g)
when try run following program on given number of points (locarray
)
locarray = np.array(locarraymaster[counter]) kmeanscluster = detk(2, x = locarray) kmeanscluster.run(5) noclusters[counter] = np.where(kmeanscluster.fs == min(kmeanscluster.fs))[0][0]+ 1
it returns me following error
file "c:\users\anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile execfile(filename, namespace) file "c:\users\anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile exec(compile(scripttext, filename, 'exec'), glob, loc) file "c:/users/documents/sumotraffic/kplusplus.py", line 355, in <module> kmeanscluster.run(5) file "c:/users/documents/sumotraffic/kplusplus.py", line 217, in run wks[0], wkbs[0], sks[0] = self.gap(1) file "c:/users/documents/sumotraffic/kplusplus.py", line 200, in gap j in range(thisk) c in cs[j]])) typeerror: 'nonetype' object has no attribute '__getitem__'
thanks help.
the error due failure of kmeans algorithm find cluster centres when number of clusters 1. hence cluster dictionary not created case. so, added line of code in class detk
checks if type of cluster dictionary 'nonetype'
, if returns true
, recalculates cluster centres again.
class detk(kplusplus): def fk(self, thisk, skm1=0): x = self.x nd = len(x[0]) = lambda k, nd: 1 - 3/(4*nd) if k == 2 else a(k-1, nd) + (1-a(k-1, nd))/6 self.find_centers(thisk, method='++')
while type(self.clusters) not dict: self.find_centers(thisk, method = '++')
mu, clusters = self.mu, self.clusters sk = sum([np.linalg.norm(mu[i]-c)**2 \ in range(thisk) c in clusters[i]]) if thisk == 1: fs = 1 elif skm1 == 0: fs = 1 else: fs = sk/(a(thisk,nd)*skm1) return fs, sk
Comments
Post a Comment