k means - TypeError when using a method from a class - python -


i have improved kmeans algorithm (kplusplus) builds on class kmeans. detk class inherited kplusplus.

the objective of kplusplus class find out optimal seeding finding kmeans centroids (source)

detk calculates gap statistic find optimal number of clusters. have found code here

# kmeans class  class kmeans(): def __init__(self, k, x=none, n=0):     self.k = k     if x == none:         if n == 0:             raise exception("if no data provided, \                              parameter n (number of points) needed")         else:             self.n = n             self.x = self._init_board_gauss(n, k)     else:         self.x = x         self.n = len(x)     self.mu = none     self.clusters = none     self.method = none  def _init_board_gauss(self, n, k):     n = float(n)/k     x = []     in range(k):         c = (random.uniform(-1,1), random.uniform(-1,1))         s = random.uniform(0.05,0.15)         x = []         while len(x) < n:             a,b = np.array([np.random.normal(c[0],s),np.random.normal(c[1],s)])             # continue drawing points distribution in range [-1,1]             if abs(a) , abs(b)<1:                 x.append([a,b])         x.extend(x)     x = np.array(x)[:n]     return x  def plot_board(self):     x = self.x     fig = plt.figure(figsize=(5,5))     plt.xlim(-1,1)     plt.ylim(-1,1)     if self.mu , self.clusters:         mu = self.mu         clus = self.clusters         k = self.k         m, clu in clus.items():             cs = cm.spectral(1.*m/self.k)             plt.plot(mu[m][0], mu[m][1], 'o', marker='*', \                      markersize=12, color=cs)             plt.plot(zip(*clus[m])[0], zip(*clus[m])[1], '.', \                      markersize=8, color=cs, alpha=0.5)     else:         plt.plot(zip(*x)[0], zip(*x)[1], '.', alpha=0.5)     if self.method == '++':         tit = 'k-means++'     else:         tit = 'k-means random initialization'     pars = 'n=%s, k=%s' % (str(self.n), str(self.k))     plt.title('\n'.join([pars, tit]), fontsize=16)     plt.savefig('kpp_n%s_k%s.png' % (str(self.n), str(self.k)), \                 bbox_inches='tight', dpi=200)  def _cluster_points(self):     mu = self.mu     clusters  = {}     x in self.x:         bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \                          in enumerate(mu)], key=lambda t:t[1])[0]         try:             clusters[bestmukey].append(x)         except keyerror:             clusters[bestmukey] = [x]     self.clusters = clusters  def _reevaluate_centers(self):     clusters = self.clusters     newmu = []     keys = sorted(self.clusters.keys())     k in keys:         newmu.append(np.mean(clusters[k], axis = 0))     self.mu = newmu  def _has_converged(self):     k = len(self.oldmu)     return(set([tuple(a) in self.mu]) == \            set([tuple(a) in self.oldmu])\            , len(set([tuple(a) in self.mu])) == k)  def find_centers(self,k, method='random'):     self.method = method     x = self.x     k = self.k     self.oldmu = random.sample(x, k)     if method != '++':         # initialize k random centers         self.mu = random.sample(x, k)     while not self._has_converged():         self.oldmu = self.mu         # assign points in x clusters         self._cluster_points()         # reevaluate centers         self._reevaluate_centers() 

the kplusplus class inherits kmeans find optimal seeding

class kplusplus(kmeans): def _dist_from_centers(self):     cent = self.mu     x = self.x     d2 = np.array([min([np.linalg.norm(x-c)**2 c in cent]) x in x])     self.d2 = d2  def _choose_next_center(self):     self.probs = self.d2/self.d2.sum()     self.cumprobs = self.probs.cumsum()     r = random.random()     ind = np.where(self.cumprobs >= r)[0][0]     return(self.x[ind])  def init_centers(self,k):     self.k = k     self.mu = random.sample(self.x, 1)     while len(self.mu) < self.k:         self._dist_from_centers()         self.mu.append(self._choose_next_center())  def plot_init_centers(self):     x = self.x     fig = plt.figure(figsize=(5,5))     plt.xlim(-1,1)     plt.ylim(-1,1)     plt.plot(zip(*x)[0], zip(*x)[1], '.', alpha=0.5)     plt.plot(zip(*self.mu)[0], zip(*self.mu)[1], 'ro')     plt.savefig('kpp_init_n%s_k%s.png' % (str(self.n),str(self.k)), \                 bbox_inches='tight', dpi=200) 

the class detk inherits kplusplus find optmal number of clusters based on gap statistic

class detk(kplusplus): def fk(self, thisk, skm1=0):     x = self.x     nd = len(x[0])     = lambda k, nd: 1 - 3/(4*nd) if k == 2 else a(k-1, nd) + (1-a(k-1, nd))/6     self.find_centers(thisk, method='++')     mu, clusters = self.mu, self.clusters     sk = sum([np.linalg.norm(mu[i]-c)**2 \              in range(thisk) c in clusters[i]])     if thisk == 1:         fs = 1     elif skm1 == 0:         fs = 1     else:         fs = sk/(a(thisk,nd)*skm1)     return fs, sk    def _bounding_box(self):     x = self.x     xmin, xmax = min(x,key=lambda a:a[0])[0], max(x,key=lambda a:a[0])[0]     ymin, ymax = min(x,key=lambda a:a[1])[1], max(x,key=lambda a:a[1])[1]     return (xmin,xmax), (ymin,ymax)          def gap(self, thisk):     x = self.x     (xmin,xmax), (ymin,ymax) = self._bounding_box()     self.init_centers(thisk)     self.find_centers(thisk, method='++')     mu, clusters = self.mu, self.clusters     wk = np.log(sum([np.linalg.norm(mu[i]-c)**2/(2*len(c)) \                 in range(thisk) c in clusters[i]]))     # create b reference datasets     b = 10     bwkbs = zeros(b)     in range(b):         xb = []         n in range(len(x)):             xb.append([random.uniform(xmin,xmax), \                       random.uniform(ymin,ymax)])         xb = np.array(xb)         kb = detk(thisk, x=xb)         kb.init_centers(thisk)         kb.find_centers(thisk, method='++')         ms, cs = kb.mu, kb.clusters         bwkbs[i] = np.log(sum([np.linalg.norm(ms[j]-c)**2/(2*len(c)) \                           j in range(thisk) c in cs[j]]))     wkb = sum(bwkbs)/b     sk = np.sqrt(sum((bwkbs-wkb)**2)/float(b))*np.sqrt(1+1/b)     return wk, wkb, sk  def run(self, maxk, which='both'):     ks = range(1,maxk)     fs = zeros(len(ks))     wks,wkbs,sks = zeros(len(ks)+1),zeros(len(ks)+1),zeros(len(ks)+1)     # special case k=1     self.init_centers(1)     if == 'f':         fs[0], sk = self.fk(1)     elif == 'gap':         wks[0], wkbs[0], sks[0] = self.gap(1)     else:         fs[0], sk = self.fk(1)         wks[0], wkbs[0], sks[0] = self.gap(1)     # rest of ks     k in ks[1:]:         self.init_centers(k)         if == 'f':             fs[k-1], sk = self.fk(k, skm1=sk)         elif == 'gap':             wks[k-1], wkbs[k-1], sks[k-1] = self.gap(k)         else:             fs[k-1], sk = self.fk(k, skm1=sk)             wks[k-1], wkbs[k-1], sks[k-1] = self.gap(k)     if == 'f':         self.fs = fs     elif == 'gap':         g = []         in range(len(ks)):             g.append((wkbs-wks)[i] - ((wkbs-wks)[i+1]-sks[i+1]))         self.g = np.array(g)     else:         self.fs = fs         g = []         in range(len(ks)):             g.append((wkbs-wks)[i] - ((wkbs-wks)[i+1]-sks[i+1]))         self.g = np.array(g) 

when try run following program on given number of points (locarray)

locarray = np.array(locarraymaster[counter])  kmeanscluster = detk(2, x = locarray) kmeanscluster.run(5) noclusters[counter] = np.where(kmeanscluster.fs == min(kmeanscluster.fs))[0][0]+ 1   

it returns me following error

file "c:\users\anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile execfile(filename, namespace)  file "c:\users\anaconda2\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile exec(compile(scripttext, filename, 'exec'), glob, loc)  file "c:/users/documents/sumotraffic/kplusplus.py", line 355, in <module> kmeanscluster.run(5)  file "c:/users/documents/sumotraffic/kplusplus.py", line 217, in run wks[0], wkbs[0], sks[0] = self.gap(1)  file "c:/users/documents/sumotraffic/kplusplus.py", line 200, in gap j in range(thisk) c in cs[j]]))  typeerror: 'nonetype' object has no attribute '__getitem__' 

thanks help.

the error due failure of kmeans algorithm find cluster centres when number of clusters 1. hence cluster dictionary not created case. so, added line of code in class detk checks if type of cluster dictionary 'nonetype' , if returns true, recalculates cluster centres again.

class detk(kplusplus):     def fk(self, thisk, skm1=0):         x = self.x         nd = len(x[0])         = lambda k, nd: 1 - 3/(4*nd) if k == 2 else a(k-1, nd) + (1-a(k-1, nd))/6         self.find_centers(thisk, method='++') 
        while type(self.clusters) not dict:             self.find_centers(thisk, method = '++') 
        mu, clusters = self.mu, self.clusters         sk = sum([np.linalg.norm(mu[i]-c)**2 \                  in range(thisk) c in clusters[i]])         if thisk == 1:             fs = 1         elif skm1 == 0:             fs = 1         else:             fs = sk/(a(thisk,nd)*skm1)         return fs, sk  

Comments