Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  from Numeric import * 
  2  from cluster import * 
  3   
  4   
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree) 7 nElements = nNodes + 1 8 neworder = zeros(nElements,'d') 9 clusterids = range(nElements) 10 for i in range(nNodes): 11 i1 = tree[i].left 12 i2 = tree[i].right 13 if i1 < 0: 14 order1 = nodeorder[-i1-1] 15 count1 = nodecounts[-i1-1] 16 else: 17 order1 = order[i1] 18 count1 = 1 19 if i2 < 0: 20 order2 = nodeorder[-i2-1] 21 count2 = nodecounts[-i2-1] 22 else: 23 order2 = order[i2] 24 count2 = 1 25 # If order1 and order2 are equal, their order is determined by the order in which they were clustered 26 if i1 < i2: 27 if order1 < order2: 28 increase = count1 29 else: 30 increase = count2 31 for j in range(nElements): 32 clusterid = clusterids[j] 33 if clusterid==i1 and order1>=order2: neworder[j] += increase 34 if clusterid==i2 and order1<order2: neworder[j] += increase 35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 36 else: 37 if order1<=order2: 38 increase = count1 39 else: 40 increase = count2 41 for j in range(nElements): 42 clusterid = clusterids[j] 43 if clusterid==i1 and order1>order2: neworder[j] += increase 44 if clusterid==i2 and order1<=order2: neworder[j] += increase 45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 46 return argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0: 50 extension = ".gtr" 51 keyword = "GENE" 52 else: 53 extension = ".atr" 54 keyword = "ARRY" 55 nnodes = len(tree) 56 outputfile = open(jobname+extension, "w"); 57 nodeindex = 0 58 nodeID = [''] * (nnodes) 59 nodecounts = zeros(nnodes) 60 nodeorder = zeros(nnodes,'d') 61 nodedist = array([node.distance for node in tree]) 62 for nodeindex in range(nnodes): 63 min1 = tree[nodeindex].left 64 min2 = tree[nodeindex].right 65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 66 outputfile.write(nodeID[nodeindex]) 67 outputfile.write("\t") 68 if min1 < 0: 69 index1 = -min1-1 70 order1 = nodeorder[index1] 71 counts1 = nodecounts[index1] 72 outputfile.write(nodeID[index1]+"\t") 73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1]) 74 else: 75 order1 = order[min1] 76 counts1 = 1 77 outputfile.write("%s%dX\t" % (keyword, min1)) 78 if min2 < 0: 79 index2 = -min2-1 80 order2 = nodeorder[index2] 81 counts2 = nodecounts[index2] 82 outputfile.write(nodeID[index2]+"\t") 83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2]) 84 else: 85 order2 = order[min2]; 86 counts2 = 1; 87 outputfile.write("%s%dX\t" % (keyword, min2)) 88 outputfile.write(str(1.0-nodedist[nodeindex])) 89 outputfile.write("\n") 90 nodecounts[nodeindex] = counts1 + counts2 91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2) 92 outputfile.close() 93 # Now set up order based on the tree structure 94 index = _treesort(order, nodeorder, nodecounts, tree) 95 return index
96
97 -class Record:
98 """A Record stores the gene expression data and related information 99 contained in a data file following the file format defined for 100 Michael Eisen's Cluster/TreeView program. A Record 101 has the following members: 102 data: a matrix containing the gene expression data 103 mask: a matrix containing only 1's and 0's, denoting which values 104 are present (1) or missing (0). If all elements of mask are 105 one (no missing data), then mask is set to None. 106 geneid: a list containing a unique identifier for each gene 107 (e.g., ORF name) 108 genename: a list containing an additional description for each gene 109 (e.g., gene name) 110 gweight: the weight to be used for each gene when calculating the 111 distance 112 gorder: an array of real numbers indicating the preferred order of the 113 genes in the output file 114 expid: a list containing a unique identifier for each experimental 115 condition 116 eweight: the weight to be used for each experimental condition when 117 calculating the distance 118 eorder: an array of real numbers indication the preferred order in the 119 output file of the experimental conditions 120 uniqid: the string that was used instead of UNIQID in the input file."""
121 - def __init__(self, handle=None):
122 """Reads a data file in the format corresponding to Michael Eisen's 123 Cluster/TreeView program, and stores the data in a Record object""" 124 self.data = None 125 self.mask = None 126 self.geneid = None 127 self.genename = None 128 self.gweight = None 129 self.gorder = None 130 self.expid = None 131 self.eweight = None 132 self.eorder = None 133 self.uniqid = None 134 if not handle: return 135 lines = handle.readlines() 136 lines = [line.strip("\r\n").split("\t") for line in lines] 137 line = lines[0] 138 n = len(line) 139 self.uniqid = line[0] 140 self.expid = [] 141 cols = {0: "GENEID"} 142 for word in line[1:]: 143 if word=="NAME": 144 cols[line.index(word)] = word 145 self.genename = [] 146 elif word=="GWEIGHT": 147 cols[line.index(word)] = word 148 self.gweight = [] 149 elif word=="GORDER": 150 cols[line.index(word)] = word 151 self.gorder = [] 152 else: self.expid.append(word) 153 self.geneid = [] 154 self.data = [] 155 self.mask = [] 156 needmask = 0 157 for line in lines[1:]: 158 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n) 159 if line[0]=="EWEIGHT": 160 i = max(cols) + 1 161 self.eweight = map(float, line[i:]) 162 continue 163 if line[0]=="EORDER": 164 i = max(cols) + 1 165 self.eorder = map(float, line[i:]) 166 continue 167 rowdata = [] 168 rowmask = [] 169 n = len(line) 170 for i in range(n): 171 word = line[i] 172 if i in cols: 173 if cols[i]=="GENEID": self.geneid.append(word) 174 if cols[i]=="NAME": self.genename.append(word) 175 if cols[i]=="GWEIGHT": self.gweight.append(float(word)) 176 if cols[i]=="GORDER": self.gorder.append(float(word)) 177 continue 178 if not word: 179 rowdata.append(0.0) 180 rowmask.append(0) 181 needmask = 1 182 else: 183 rowdata.append(float(word)) 184 rowmask.append(1) 185 self.data.append(rowdata) 186 self.mask.append(rowmask) 187 self.data = array(self.data) 188 if needmask: self.mask = array(self.mask) 189 else: self.mask = None 190 if self.gweight: self.gweight = array(self.gweight) 191 if self.gorder: self.gorder = array(self.gorder)
192
193 - def treecluster(self, transpose=0, method='m', dist='e'):
194 if transpose==0: weight = self.eweight 195 else: weight = self.gweight 196 return treecluster(self.data, self.mask, weight, transpose, method, dist)
197
198 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
199 if transpose==0: weight = self.eweight 200 else: weight = self.gweight 201 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid) 202 return clusterid, error, nfound
203
204 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
205 if transpose==0: weight = self.eweight 206 else: weight = self.gweight 207 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist) 208 return clusterid, celldata
209
210 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
211 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose) 212 return cdata, cmask
213
214 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 215 transpose=0):
216 if transpose==0: weight = self.eweight 217 else: weight = self.gweight 218 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
219
220 - def distancematrix(self, transpose=0, dist='e'):
221 if transpose==0: weight = self.eweight 222 else: weight = self.gweight 223 return distancematrix(self.data, self.mask, weight, transpose, dist)
224
225 - def save(self, jobname, geneclusters=None, expclusters=None):
226 """save(jobname, geneclusters=None, expclusters=None) 227 saves the clustering results. The saved files follow the convention 228 for Java TreeView program, which can therefore be used to view the 229 clustering result. 230 Arguments: 231 jobname: The base name of the files to be saved. The filenames are 232 jobname.cdt, jobname.gtr, and jobname.atr for 233 hierarchical clustering, and jobname-K*.cdt, 234 jobname-K*.kgg, jobname-K*.kag for k-means clustering 235 results. 236 geneclusters=None: For hierarchical clustering results, 237 geneclusters is an (ngenes-1 x 2) array that describes 238 the hierarchical clustering result for genes. This array 239 can be calculated by the hierarchical clustering methods 240 implemented in treecluster. 241 For k-means clustering results, geneclusters is a vector 242 containing ngenes integers, describing to which cluster a 243 given gene belongs. This vector can be calculated by 244 kcluster. 245 expclusters=None: For hierarchical clustering results, expclusters 246 is an (nexps-1 x 2) array that describes the hierarchical 247 clustering result for experimental conditions. This array 248 can be calculated by the hierarchical clustering methods 249 implemented in treecluster. 250 For k-means clustering results, expclusters is a vector 251 containing nexps integers, describing to which cluster a 252 given experimental condition belongs. This vector can be 253 calculated by kcluster. 254 """ 255 (ngenes,nexps) = shape(self.data) 256 if self.gorder==None: gorder = arange(ngenes) 257 else: gorder = self.gorder 258 if self.eorder==None: eorder = arange(nexps) 259 else: eorder = self.eorder 260 if geneclusters and expclusters: 261 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters" 262 gid = 0 263 aid = 0 264 filename = jobname 265 postfix = "" 266 if type(geneclusters)==Tree: 267 # Hierarchical clustering result 268 geneindex = _savetree(jobname, geneclusters, gorder, 0) 269 gid = 1 270 elif geneclusters: 271 # k-means clustering result 272 filename = jobname + "_K" 273 k = max(geneclusters+1) 274 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 275 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 276 postfix = "_G%d" % k 277 else: 278 geneindex = argsort(gorder) 279 if type(expclusters)==Tree: 280 # Hierarchical clustering result 281 expindex = _savetree(jobname, expclusters, eorder, 1) 282 aid = 1 283 elif expclusters: 284 # k-means clustering result 285 filename = jobname + "_K" 286 k = max(expclusters+1) 287 kagfilename = "%s_K_A%d.kag" % (jobname, k) 288 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 289 postfix += "_A%d" % k 290 else: 291 expindex = argsort(eorder) 292 filename = filename + postfix 293 self._savedata(filename,gid,aid,geneindex,expindex)
294
295 - def _savekmeans(self, filename, clusterids, order, transpose):
296 if transpose==0: 297 label = self.uniqid 298 names = self.geneid 299 else: 300 label = "ARRAY" 301 names = self.expid 302 outputfile = open(filename, "w"); 303 if not outputfile: raise "Error: Unable to open output file" 304 outputfile.write(label + "\tGROUP\n") 305 index = argsort(order) 306 n = len(names) 307 sortedindex = zeros(n) 308 counter = 0 309 cluster = 0 310 while counter < n: 311 for j in index: 312 if clusterids[j]==cluster: 313 outputfile.write("%s\t%s\n" % (names[j], cluster)) 314 sortedindex[counter] = j 315 counter+=1 316 cluster+=1 317 outputfile.close(); 318 return sortedindex
319
320 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
321 if self.genename==None: genename = self.geneid 322 else: genename = self.genename 323 (ngenes, nexps) = shape(self.data) 324 outputfile = open(jobname+'.cdt', 'w') 325 if not outputfile: return "Error: Unable to open output file" 326 if self.mask: mask = self.mask 327 else: mask = ones((ngenes,nexps)) 328 if self.gweight: gweight = self.gweight 329 else: gweight = ones(ngenes) 330 if self.eweight: eweight = self.eweight 331 else: eweight = ones(nexps) 332 if gid: outputfile.write ('GID\t') 333 outputfile.write(self.uniqid) 334 outputfile.write('\tNAME\tGWEIGHT') 335 # Now add headers for data columns 336 for j in expindex: outputfile.write('\t%s' % self.expid[j]) 337 outputfile.write('\n') 338 if aid: 339 outputfile.write("AID") 340 if gid: outputfile.write('\t') 341 outputfile.write("\t\t") 342 for j in expindex: outputfile.write ('\tARRY%dX' % j) 343 outputfile.write('\n') 344 outputfile.write('EWEIGHT') 345 if gid: outputfile.write('\t') 346 outputfile.write('\t\t') 347 for j in expindex: outputfile.write('\t%f' % eweight[j]) 348 outputfile.write('\n') 349 for i in geneindex: 350 if gid: outputfile.write('GENE%dX\t' % i) 351 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i])) 352 for j in expindex: 353 outputfile.write('\t') 354 if mask[i][j]: outputfile.write(str(self.data[i][j])) 355 outputfile.write('\n') 356 outputfile.close()
357
358 -def read(handle):
359 return Record(handle)
360
361 -class DataFile(Record):
362 - def __init__(self, filename=None):
363 import warnings 364 warnings.warn("""The DataFile class has been deprecated. Please use the Record class instead. To load data from a file into a Record object, use "record = read(open('mydatafile.txt'))" """, DeprecationWarning) 365 if filename: handle = open(filename) 366 else: handle = None 367 Record.__init__(self, handle)
368