Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  import numpy 
  2  from cluster import * 
  3   
  4   
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree) 7 nElements = nNodes + 1 8 neworder = numpy.zeros(nElements) 9 clusterids = range(nElements) 10 for i in range(nNodes): 11 i1 = tree[i].left 12 i2 = tree[i].right 13 if i1 < 0: 14 order1 = nodeorder[-i1-1] 15 count1 = nodecounts[-i1-1] 16 else: 17 order1 = order[i1] 18 count1 = 1 19 if i2 < 0: 20 order2 = nodeorder[-i2-1] 21 count2 = nodecounts[-i2-1] 22 else: 23 order2 = order[i2] 24 count2 = 1 25 # If order1 and order2 are equal, their order is determined by the order in which they were clustered 26 if i1 < i2: 27 if order1 < order2: 28 increase = count1 29 else: 30 increase = count2 31 for j in range(nElements): 32 clusterid = clusterids[j] 33 if clusterid==i1 and order1>=order2: neworder[j] += increase 34 if clusterid==i2 and order1<order2: neworder[j] += increase 35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 36 else: 37 if order1<=order2: 38 increase = count1 39 else: 40 increase = count2 41 for j in range(nElements): 42 clusterid = clusterids[j] 43 if clusterid==i1 and order1>order2: neworder[j] += increase 44 if clusterid==i2 and order1<=order2: neworder[j] += increase 45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 46 return numpy.argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0: 50 extension = ".gtr" 51 keyword = "GENE" 52 else: 53 extension = ".atr" 54 keyword = "ARRY" 55 nnodes = len(tree) 56 outputfile = open(jobname+extension, "w"); 57 nodeindex = 0 58 nodeID = [''] * (nnodes) 59 nodecounts = numpy.zeros(nnodes, int) 60 nodeorder = numpy.zeros(nnodes) 61 nodedist = numpy.array([node.distance for node in tree]) 62 for nodeindex in range(nnodes): 63 min1 = tree[nodeindex].left 64 min2 = tree[nodeindex].right 65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 66 outputfile.write(nodeID[nodeindex]) 67 outputfile.write("\t") 68 if min1 < 0: 69 index1 = -min1-1 70 order1 = nodeorder[index1] 71 counts1 = nodecounts[index1] 72 outputfile.write(nodeID[index1]+"\t") 73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1]) 74 else: 75 order1 = order[min1] 76 counts1 = 1 77 outputfile.write("%s%dX\t" % (keyword, min1)) 78 if min2 < 0: 79 index2 = -min2-1 80 order2 = nodeorder[index2] 81 counts2 = nodecounts[index2] 82 outputfile.write(nodeID[index2]+"\t") 83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2]) 84 else: 85 order2 = order[min2]; 86 counts2 = 1; 87 outputfile.write("%s%dX\t" % (keyword, min2)) 88 outputfile.write(str(1.0-nodedist[nodeindex])) 89 outputfile.write("\n") 90 nodecounts[nodeindex] = counts1 + counts2 91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2) 92 outputfile.close() 93 # Now set up order based on the tree structure 94 index = _treesort(order, nodeorder, nodecounts, tree) 95 return index
96
97 -class Record:
98 """A Record stores the gene expression data and related information 99 contained in a data file following the file format defined for 100 Michael Eisen's Cluster/TreeView program. A Record 101 has the following members: 102 data: a matrix containing the gene expression data 103 mask: a matrix containing only 1's and 0's, denoting which values 104 are present (1) or missing (0). If all elements of mask are 105 one (no missing data), then mask is set to None. 106 geneid: a list containing a unique identifier for each gene 107 (e.g., ORF name) 108 genename: a list containing an additional description for each gene 109 (e.g., gene name) 110 gweight: the weight to be used for each gene when calculating the 111 distance 112 gorder: an array of real numbers indicating the preferred order of the 113 genes in the output file 114 expid: a list containing a unique identifier for each experimental 115 condition 116 eweight: the weight to be used for each experimental condition when 117 calculating the distance 118 eorder: an array of real numbers indication the preferred order in the 119 output file of the experimental conditions 120 uniqid: the string that was used instead of UNIQID in the input file."""
121 - def __init__(self, handle=None):
122 """Reads a data file in the format corresponding to Michael Eisen's 123 Cluster/TreeView program, and stores the data in a Record object""" 124 self.data = None 125 self.mask = None 126 self.geneid = None 127 self.genename = None 128 self.gweight = None 129 self.gorder = None 130 self.expid = None 131 self.eweight = None 132 self.eorder = None 133 self.uniqid = None 134 if not handle: return 135 line = handle.readline().strip("\r\n").split("\t") 136 n = len(line) 137 self.uniqid = line[0] 138 self.expid = [] 139 cols = {0: "GENEID"} 140 for word in line[1:]: 141 if word=="NAME": 142 cols[line.index(word)] = word 143 self.genename = [] 144 elif word=="GWEIGHT": 145 cols[line.index(word)] = word 146 self.gweight = [] 147 elif word=="GORDER": 148 cols[line.index(word)] = word 149 self.gorder = [] 150 else: self.expid.append(word) 151 self.geneid = [] 152 self.data = [] 153 self.mask = [] 154 needmask = 0 155 for line in handle: 156 line = line.strip("\r\n").split("\t") 157 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n) 158 if line[0]=="EWEIGHT": 159 i = max(cols) + 1 160 self.eweight = map(float, line[i:]) 161 continue 162 if line[0]=="EORDER": 163 i = max(cols) + 1 164 self.eorder = map(float, line[i:]) 165 continue 166 rowdata = [] 167 rowmask = [] 168 n = len(line) 169 for i in range(n): 170 word = line[i] 171 if i in cols: 172 if cols[i]=="GENEID": self.geneid.append(word) 173 if cols[i]=="NAME": self.genename.append(word) 174 if cols[i]=="GWEIGHT": self.gweight.append(float(word)) 175 if cols[i]=="GORDER": self.gorder.append(float(word)) 176 continue 177 if not word: 178 rowdata.append(0.0) 179 rowmask.append(0) 180 needmask = 1 181 else: 182 rowdata.append(float(word)) 183 rowmask.append(1) 184 self.data.append(rowdata) 185 self.mask.append(rowmask) 186 self.data = numpy.array(self.data) 187 if needmask: self.mask = numpy.array(self.mask, int) 188 else: self.mask = None 189 if self.gweight: self.gweight = numpy.array(self.gweight) 190 if self.gorder: self.gorder = numpy.array(self.gorder)
191
192 - def treecluster(self, transpose=0, method='m', dist='e'):
193 if transpose==0: weight = self.eweight 194 else: weight = self.gweight 195 return treecluster(self.data, self.mask, weight, transpose, method, dist)
196
197 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
198 if transpose==0: weight = self.eweight 199 else: weight = self.gweight 200 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid) 201 return clusterid, error, nfound
202
203 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
204 if transpose==0: weight = self.eweight 205 else: weight = self.gweight 206 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist) 207 return clusterid, celldata
208
209 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
210 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose) 211 return cdata, cmask
212
213 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 214 transpose=0):
215 if transpose==0: weight = self.eweight 216 else: weight = self.gweight 217 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
218
219 - def distancematrix(self, transpose=0, dist='e'):
220 if transpose==0: weight = self.eweight 221 else: weight = self.gweight 222 return distancematrix(self.data, self.mask, weight, transpose, dist)
223
224 - def save(self, jobname, geneclusters=None, expclusters=None):
225 """save(jobname, geneclusters=None, expclusters=None) 226 saves the clustering results. The saved files follow the convention 227 for Java TreeView program, which can therefore be used to view the 228 clustering result. 229 Arguments: 230 jobname: The base name of the files to be saved. The filenames are 231 jobname.cdt, jobname.gtr, and jobname.atr for 232 hierarchical clustering, and jobname-K*.cdt, 233 jobname-K*.kgg, jobname-K*.kag for k-means clustering 234 results. 235 geneclusters=None: For hierarchical clustering results, 236 geneclusters is an (ngenes-1 x 2) array that describes 237 the hierarchical clustering result for genes. This array 238 can be calculated by the hierarchical clustering methods 239 implemented in treecluster. 240 For k-means clustering results, geneclusters is a vector 241 containing ngenes integers, describing to which cluster a 242 given gene belongs. This vector can be calculated by 243 kcluster. 244 expclusters=None: For hierarchical clustering results, expclusters 245 is an (nexps-1 x 2) array that describes the hierarchical 246 clustering result for experimental conditions. This array 247 can be calculated by the hierarchical clustering methods 248 implemented in treecluster. 249 For k-means clustering results, expclusters is a vector 250 containing nexps integers, describing to which cluster a 251 given experimental condition belongs. This vector can be 252 calculated by kcluster. 253 """ 254 (ngenes,nexps) = shape(self.data) 255 if self.gorder==None: gorder = numpy.arange(ngenes) 256 else: gorder = self.gorder 257 if self.eorder==None: eorder = numpy.arange(nexps) 258 else: eorder = self.eorder 259 if geneclusters and expclusters: 260 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters" 261 gid = 0 262 aid = 0 263 filename = jobname 264 postfix = "" 265 if type(geneclusters)==Tree: 266 # Hierarchical clustering result 267 geneindex = _savetree(jobname, geneclusters, gorder, 0) 268 gid = 1 269 elif geneclusters: 270 # k-means clustering result 271 filename = jobname + "_K" 272 k = max(geneclusters+1) 273 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 274 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 275 postfix = "_G%d" % k 276 else: 277 geneindex = numpy.argsort(gorder) 278 if type(expclusters)==Tree: 279 # Hierarchical clustering result 280 expindex = _savetree(jobname, expclusters, eorder, 1) 281 aid = 1 282 elif expclusters: 283 # k-means clustering result 284 filename = jobname + "_K" 285 k = max(expclusters+1) 286 kagfilename = "%s_K_A%d.kag" % (jobname, k) 287 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 288 postfix += "_A%d" % k 289 else: 290 expindex = numpy.argsort(eorder) 291 filename = filename + postfix 292 self._savedata(filename,gid,aid,geneindex,expindex)
293
294 - def _savekmeans(self, filename, clusterids, order, transpose):
295 if transpose==0: 296 label = self.uniqid 297 names = self.geneid 298 else: 299 label = "ARRAY" 300 names = self.expid 301 outputfile = open(filename, "w"); 302 if not outputfile: raise "Error: Unable to open output file" 303 outputfile.write(label + "\tGROUP\n") 304 index = numpy.argsort(order) 305 n = len(names) 306 sortedindex = numpy.zeros(n, int) 307 counter = 0 308 cluster = 0 309 while counter < n: 310 for j in index: 311 if clusterids[j]==cluster: 312 outputfile.write("%s\t%s\n" % (names[j], cluster)) 313 sortedindex[counter] = j 314 counter+=1 315 cluster+=1 316 outputfile.close(); 317 return sortedindex
318
319 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
320 if self.genename==None: genename = self.geneid 321 else: genename = self.genename 322 (ngenes, nexps) = numpy.shape(self.data) 323 outputfile = open(jobname+'.cdt', 'w') 324 if not outputfile: return "Error: Unable to open output file" 325 if self.mask: mask = self.mask 326 else: mask = numpy.ones((ngenes,nexps), int) 327 if self.gweight: gweight = self.gweight 328 else: gweight = numpy.ones(ngenes) 329 if self.eweight: eweight = self.eweight 330 else: eweight = numpy.ones(nexps) 331 if gid: outputfile.write ('GID\t') 332 outputfile.write(self.uniqid) 333 outputfile.write('\tNAME\tGWEIGHT') 334 # Now add headers for data columns 335 for j in expindex: outputfile.write('\t%s' % self.expid[j]) 336 outputfile.write('\n') 337 if aid: 338 outputfile.write("AID") 339 if gid: outputfile.write('\t') 340 outputfile.write("\t\t") 341 for j in expindex: outputfile.write ('\tARRY%dX' % j) 342 outputfile.write('\n') 343 outputfile.write('EWEIGHT') 344 if gid: outputfile.write('\t') 345 outputfile.write('\t\t') 346 for j in expindex: outputfile.write('\t%f' % eweight[j]) 347 outputfile.write('\n') 348 for i in geneindex: 349 if gid: outputfile.write('GENE%dX\t' % i) 350 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i])) 351 for j in expindex: 352 outputfile.write('\t') 353 if mask[i][j]: outputfile.write(str(self.data[i][j])) 354 outputfile.write('\n') 355 outputfile.close()
356
357 -def read(handle):
358 return Record(handle)
359