1 import numpy
2 from cluster import *
3
4
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree)
7 nElements = nNodes + 1
8 neworder = numpy.zeros(nElements)
9 clusterids = range(nElements)
10 for i in range(nNodes):
11 i1 = tree[i].left
12 i2 = tree[i].right
13 if i1 < 0:
14 order1 = nodeorder[-i1-1]
15 count1 = nodecounts[-i1-1]
16 else:
17 order1 = order[i1]
18 count1 = 1
19 if i2 < 0:
20 order2 = nodeorder[-i2-1]
21 count2 = nodecounts[-i2-1]
22 else:
23 order2 = order[i2]
24 count2 = 1
25
26 if i1 < i2:
27 if order1 < order2:
28 increase = count1
29 else:
30 increase = count2
31 for j in range(nElements):
32 clusterid = clusterids[j]
33 if clusterid==i1 and order1>=order2: neworder[j] += increase
34 if clusterid==i2 and order1<order2: neworder[j] += increase
35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
36 else:
37 if order1<=order2:
38 increase = count1
39 else:
40 increase = count2
41 for j in range(nElements):
42 clusterid = clusterids[j]
43 if clusterid==i1 and order1>order2: neworder[j] += increase
44 if clusterid==i2 and order1<=order2: neworder[j] += increase
45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
46 return numpy.argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0:
50 extension = ".gtr"
51 keyword = "GENE"
52 else:
53 extension = ".atr"
54 keyword = "ARRY"
55 nnodes = len(tree)
56 outputfile = open(jobname+extension, "w");
57 nodeindex = 0
58 nodeID = [''] * (nnodes)
59 nodecounts = numpy.zeros(nnodes, int)
60 nodeorder = numpy.zeros(nnodes)
61 nodedist = numpy.array([node.distance for node in tree])
62 for nodeindex in range(nnodes):
63 min1 = tree[nodeindex].left
64 min2 = tree[nodeindex].right
65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
66 outputfile.write(nodeID[nodeindex])
67 outputfile.write("\t")
68 if min1 < 0:
69 index1 = -min1-1
70 order1 = nodeorder[index1]
71 counts1 = nodecounts[index1]
72 outputfile.write(nodeID[index1]+"\t")
73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
74 else:
75 order1 = order[min1]
76 counts1 = 1
77 outputfile.write("%s%dX\t" % (keyword, min1))
78 if min2 < 0:
79 index2 = -min2-1
80 order2 = nodeorder[index2]
81 counts2 = nodecounts[index2]
82 outputfile.write(nodeID[index2]+"\t")
83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
84 else:
85 order2 = order[min2];
86 counts2 = 1;
87 outputfile.write("%s%dX\t" % (keyword, min2))
88 outputfile.write(str(1.0-nodedist[nodeindex]))
89 outputfile.write("\n")
90 nodecounts[nodeindex] = counts1 + counts2
91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2)
92 outputfile.close()
93
94 index = _treesort(order, nodeorder, nodecounts, tree)
95 return index
96
98 """A Record stores the gene expression data and related information
99 contained in a data file following the file format defined for
100 Michael Eisen's Cluster/TreeView program. A Record
101 has the following members:
102 data: a matrix containing the gene expression data
103 mask: a matrix containing only 1's and 0's, denoting which values
104 are present (1) or missing (0). If all elements of mask are
105 one (no missing data), then mask is set to None.
106 geneid: a list containing a unique identifier for each gene
107 (e.g., ORF name)
108 genename: a list containing an additional description for each gene
109 (e.g., gene name)
110 gweight: the weight to be used for each gene when calculating the
111 distance
112 gorder: an array of real numbers indicating the preferred order of the
113 genes in the output file
114 expid: a list containing a unique identifier for each experimental
115 condition
116 eweight: the weight to be used for each experimental condition when
117 calculating the distance
118 eorder: an array of real numbers indication the preferred order in the
119 output file of the experimental conditions
120 uniqid: the string that was used instead of UNIQID in the input file."""
122 """Reads a data file in the format corresponding to Michael Eisen's
123 Cluster/TreeView program, and stores the data in a Record object"""
124 self.data = None
125 self.mask = None
126 self.geneid = None
127 self.genename = None
128 self.gweight = None
129 self.gorder = None
130 self.expid = None
131 self.eweight = None
132 self.eorder = None
133 self.uniqid = None
134 if not handle: return
135 line = handle.readline().strip("\r\n").split("\t")
136 n = len(line)
137 self.uniqid = line[0]
138 self.expid = []
139 cols = {0: "GENEID"}
140 for word in line[1:]:
141 if word=="NAME":
142 cols[line.index(word)] = word
143 self.genename = []
144 elif word=="GWEIGHT":
145 cols[line.index(word)] = word
146 self.gweight = []
147 elif word=="GORDER":
148 cols[line.index(word)] = word
149 self.gorder = []
150 else: self.expid.append(word)
151 self.geneid = []
152 self.data = []
153 self.mask = []
154 needmask = 0
155 for line in handle:
156 line = line.strip("\r\n").split("\t")
157 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n)
158 if line[0]=="EWEIGHT":
159 i = max(cols) + 1
160 self.eweight = map(float, line[i:])
161 continue
162 if line[0]=="EORDER":
163 i = max(cols) + 1
164 self.eorder = map(float, line[i:])
165 continue
166 rowdata = []
167 rowmask = []
168 n = len(line)
169 for i in range(n):
170 word = line[i]
171 if i in cols:
172 if cols[i]=="GENEID": self.geneid.append(word)
173 if cols[i]=="NAME": self.genename.append(word)
174 if cols[i]=="GWEIGHT": self.gweight.append(float(word))
175 if cols[i]=="GORDER": self.gorder.append(float(word))
176 continue
177 if not word:
178 rowdata.append(0.0)
179 rowmask.append(0)
180 needmask = 1
181 else:
182 rowdata.append(float(word))
183 rowmask.append(1)
184 self.data.append(rowdata)
185 self.mask.append(rowmask)
186 self.data = numpy.array(self.data)
187 if needmask: self.mask = numpy.array(self.mask, int)
188 else: self.mask = None
189 if self.gweight: self.gweight = numpy.array(self.gweight)
190 if self.gorder: self.gorder = numpy.array(self.gorder)
191
192 - def treecluster(self, transpose=0, method='m', dist='e'):
193 if transpose==0: weight = self.eweight
194 else: weight = self.gweight
195 return treecluster(self.data, self.mask, weight, transpose, method, dist)
196
197 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
198 if transpose==0: weight = self.eweight
199 else: weight = self.gweight
200 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid)
201 return clusterid, error, nfound
202
203 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
204 if transpose==0: weight = self.eweight
205 else: weight = self.gweight
206 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist)
207 return clusterid, celldata
208
210 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose)
211 return cdata, cmask
212
213 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
214 transpose=0):
215 if transpose==0: weight = self.eweight
216 else: weight = self.gweight
217 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
218
220 if transpose==0: weight = self.eweight
221 else: weight = self.gweight
222 return distancematrix(self.data, self.mask, weight, transpose, dist)
223
224 - def save(self, jobname, geneclusters=None, expclusters=None):
225 """save(jobname, geneclusters=None, expclusters=None)
226 saves the clustering results. The saved files follow the convention
227 for Java TreeView program, which can therefore be used to view the
228 clustering result.
229 Arguments:
230 jobname: The base name of the files to be saved. The filenames are
231 jobname.cdt, jobname.gtr, and jobname.atr for
232 hierarchical clustering, and jobname-K*.cdt,
233 jobname-K*.kgg, jobname-K*.kag for k-means clustering
234 results.
235 geneclusters=None: For hierarchical clustering results,
236 geneclusters is an (ngenes-1 x 2) array that describes
237 the hierarchical clustering result for genes. This array
238 can be calculated by the hierarchical clustering methods
239 implemented in treecluster.
240 For k-means clustering results, geneclusters is a vector
241 containing ngenes integers, describing to which cluster a
242 given gene belongs. This vector can be calculated by
243 kcluster.
244 expclusters=None: For hierarchical clustering results, expclusters
245 is an (nexps-1 x 2) array that describes the hierarchical
246 clustering result for experimental conditions. This array
247 can be calculated by the hierarchical clustering methods
248 implemented in treecluster.
249 For k-means clustering results, expclusters is a vector
250 containing nexps integers, describing to which cluster a
251 given experimental condition belongs. This vector can be
252 calculated by kcluster.
253 """
254 (ngenes,nexps) = shape(self.data)
255 if self.gorder==None: gorder = numpy.arange(ngenes)
256 else: gorder = self.gorder
257 if self.eorder==None: eorder = numpy.arange(nexps)
258 else: eorder = self.eorder
259 if geneclusters and expclusters:
260 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters"
261 gid = 0
262 aid = 0
263 filename = jobname
264 postfix = ""
265 if type(geneclusters)==Tree:
266
267 geneindex = _savetree(jobname, geneclusters, gorder, 0)
268 gid = 1
269 elif geneclusters:
270
271 filename = jobname + "_K"
272 k = max(geneclusters+1)
273 kggfilename = "%s_K_G%d.kgg" % (jobname, k)
274 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
275 postfix = "_G%d" % k
276 else:
277 geneindex = numpy.argsort(gorder)
278 if type(expclusters)==Tree:
279
280 expindex = _savetree(jobname, expclusters, eorder, 1)
281 aid = 1
282 elif expclusters:
283
284 filename = jobname + "_K"
285 k = max(expclusters+1)
286 kagfilename = "%s_K_A%d.kag" % (jobname, k)
287 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
288 postfix += "_A%d" % k
289 else:
290 expindex = numpy.argsort(eorder)
291 filename = filename + postfix
292 self._savedata(filename,gid,aid,geneindex,expindex)
293
294 - def _savekmeans(self, filename, clusterids, order, transpose):
295 if transpose==0:
296 label = self.uniqid
297 names = self.geneid
298 else:
299 label = "ARRAY"
300 names = self.expid
301 outputfile = open(filename, "w");
302 if not outputfile: raise "Error: Unable to open output file"
303 outputfile.write(label + "\tGROUP\n")
304 index = numpy.argsort(order)
305 n = len(names)
306 sortedindex = numpy.zeros(n, int)
307 counter = 0
308 cluster = 0
309 while counter < n:
310 for j in index:
311 if clusterids[j]==cluster:
312 outputfile.write("%s\t%s\n" % (names[j], cluster))
313 sortedindex[counter] = j
314 counter+=1
315 cluster+=1
316 outputfile.close();
317 return sortedindex
318
319 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
320 if self.genename==None: genename = self.geneid
321 else: genename = self.genename
322 (ngenes, nexps) = numpy.shape(self.data)
323 outputfile = open(jobname+'.cdt', 'w')
324 if not outputfile: return "Error: Unable to open output file"
325 if self.mask: mask = self.mask
326 else: mask = numpy.ones((ngenes,nexps), int)
327 if self.gweight: gweight = self.gweight
328 else: gweight = numpy.ones(ngenes)
329 if self.eweight: eweight = self.eweight
330 else: eweight = numpy.ones(nexps)
331 if gid: outputfile.write ('GID\t')
332 outputfile.write(self.uniqid)
333 outputfile.write('\tNAME\tGWEIGHT')
334
335 for j in expindex: outputfile.write('\t%s' % self.expid[j])
336 outputfile.write('\n')
337 if aid:
338 outputfile.write("AID")
339 if gid: outputfile.write('\t')
340 outputfile.write("\t\t")
341 for j in expindex: outputfile.write ('\tARRY%dX' % j)
342 outputfile.write('\n')
343 outputfile.write('EWEIGHT')
344 if gid: outputfile.write('\t')
345 outputfile.write('\t\t')
346 for j in expindex: outputfile.write('\t%f' % eweight[j])
347 outputfile.write('\n')
348 for i in geneindex:
349 if gid: outputfile.write('GENE%dX\t' % i)
350 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]))
351 for j in expindex:
352 outputfile.write('\t')
353 if mask[i][j]: outputfile.write(str(self.data[i][j]))
354 outputfile.write('\n')
355 outputfile.close()
356
359