1
2
3
4
5
6
7
8
9
10
11 """Nexus class. Parse the contents of a nexus file.
12 Based upon 'NEXUS: An extensible file format for systematic information'
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
14 """
15
16 import os,sys, math, random, copy
17 import sets
18
19 from Bio.Alphabet import IUPAC
20 from Bio.Data import IUPACData
21 from Bio.Seq import Seq
22
23 from Trees import Tree,NodeData
24
25 try:
26 import cnexus
27 except ImportError:
28 C=False
29 else:
30 C=True
31
32 INTERLEAVE=70
33 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
34 'matrix','tree', 'utree','translate','codonposset','title']
35 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
36 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
37 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
38 WHITESPACE=' \t\n'
39
40 SPECIALCOMMENTS=['&']
41 CHARSET='chars'
42 TAXSET='taxa'
43 CODONPOSITIONS='codonpositions'
44 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
46
48 """Helps reading NEXUS-words and characters from a buffer."""
50 if string:
51 self.buffer=list(string)
52 else:
53 self.buffer=[]
54
56 if self.buffer:
57 return self.buffer[0]
58 else:
59 return None
60
62 b=''.join(self.buffer).strip()
63 if b:
64 return b[0]
65 else:
66 return None
67
69 if self.buffer:
70 return self.buffer.pop(0)
71 else:
72 return None
73
75 while True:
76 p=self.next()
77 if p is None:
78 break
79 if p not in WHITESPACE:
80 return p
81 return None
82
84 while self.buffer[0] in WHITESPACE:
85 self.buffer=self.buffer[1:]
86
88 for t in target:
89 try:
90 pos=self.buffer.index(t)
91 except ValueError:
92 pass
93 else:
94 found=''.join(self.buffer[:pos])
95 self.buffer=self.buffer[pos:]
96 return found
97 else:
98 return None
99
101 return ''.join(self.buffer[:len(word)])==word
102
104 """Return the next NEXUS word from a string, dealing with single and double quotes,
105 whitespace and punctuation.
106 """
107
108 word=[]
109 quoted=False
110 first=self.next_nonwhitespace()
111 if not first:
112 return None
113 word.append(first)
114 if first=="'":
115 quoted=True
116 elif first in PUNCTUATION:
117 return first
118 while True:
119 c=self.peek()
120 if c=="'":
121 word.append(self.next())
122 if self.peek()=="'":
123 skip=self.next()
124 elif quoted:
125 break
126 elif quoted:
127 word.append(self.next())
128 elif not c or c in PUNCTUATION or c in WHITESPACE:
129 break
130 else:
131 word.append(self.next())
132 return ''.join(word)
133
135 """Return the rest of the string without parsing."""
136 return ''.join(self.buffer)
137
139 """Calculate a stepmatrix for weighted parsimony.
140 See Wheeler (1990), Cladistics 6:269-275.
141 """
142
144 self.data={}
145 self.symbols=[s for s in symbols]
146 self.symbols.sort()
147 if gap:
148 self.symbols.append(gap)
149 for x in self.symbols:
150 for y in [s for s in self.symbols if s!=x]:
151 self.set(x,y,0)
152
153 - def set(self,x,y,value):
157
158 - def add(self,x,y,value):
162
165
172
174 for k in self.data:
175 if self.data[k]!=0:
176 self.data[k]=-math.log(self.data[k])
177 return self
178
179 - def smprint(self,name='your_name_here'):
180 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
181 matrix+=' %s\n' % ' '.join(self.symbols)
182 for x in self.symbols:
183 matrix+='[%s]'.ljust(8) % x
184 for y in self.symbols:
185 if x==y:
186 matrix+=' . '
187 else:
188 if x>y:
189 x1,y1=y,x
190 else:
191 x1,y1=x,y
192 if self.data[x1+y1]==0:
193 matrix+='inf. '
194 else:
195 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
196 matrix+='\n'
197 matrix+=';\n'
198 return matrix
199
201 """Return a taxon identifier according to NEXUS standard.
202 Wrap quotes around names with punctuation or whitespace, and double single quotes.
203 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
204 """
205 if mrbayes:
206 safe=name.replace(' ','_')
207 safe=''.join([c for c in safe if c in MRBAYESSAFE])
208 else:
209 safe=name.replace("'","''")
210 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)):
211 safe="'"+safe+"'"
212 return safe
213
215 """Remove quotes and/or double quotes around identifiers."""
216 if not word:
217 return None
218 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
219 word=word[1:-1]
220 return word
221
238
240 """Returns a sorted list of keys of p sorted by values of p."""
241 startpos=[(p[pn],pn) for pn in p if p[pn]]
242 startpos.sort()
243 return zip(*startpos)[1]
244
246 """Check that all values in list are unique and return a pruned and sorted list."""
247 l=list(sets.Set(l))
248 l.sort()
249 return l
250
252 """Returns a unique name if label is already in previous_labels."""
253 while label in previous_labels:
254 if label.split('.')[-1].startswith('copy'):
255 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
256 else:
257 label+='.copy'
258 return label
259
261 """Converts a Seq-object matrix to a plain sequence-string matrix."""
262 return dict([(t,matrix[t].tostring()) for t in matrix])
263
265 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
266 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
267 """
268
269 if not orig_list:
270 return ''
271 orig_list=list(sets.Set(orig_list))
272 orig_list.sort()
273 shortlist=[]
274 clist=orig_list[:]
275 clist.append(clist[-1]+.5)
276 while len(clist)>1:
277 step=1
278 for i,x in enumerate(clist):
279 if x==clist[0]+i*step:
280 continue
281 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
282
283
284 step=x-clist[0]
285 else:
286 sub=clist[:i]
287 if len(sub)==1:
288 shortlist.append(str(sub[0]+1))
289 else:
290 if step==1:
291 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
292 else:
293 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
294 clist=clist[i:]
295 break
296 return ' '.join(shortlist)
297
299 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
300
301 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
302 Character sets, character partitions and taxon sets are prefixed, readjusted and present in
303 the combined matrix.
304 """
305
306 if not matrices:
307 return None
308 name=matrices[0][0]
309 combined=copy.deepcopy(matrices[0][1])
310 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
311 if mixed_datatypes:
312 combined.datatype='None'
313
314 combined.charlabels=None
315 combined.statelabels=None
316 combined.interleave=False
317 combined.translate=None
318
319
320 for cn,cs in combined.charsets.items():
321 combined.charsets['%s.%s' % (name,cn)]=cs
322 del combined.charsets[cn]
323 for tn,ts in combined.taxsets.items():
324 combined.taxsets['%s.%s' % (name,tn)]=ts
325 del combined.taxsets[tn]
326
327
328 combined.charpartitions={'combined':{name:range(combined.nchar)}}
329 for n,m in matrices[1:]:
330 both=[t for t in combined.taxlabels if t in m.taxlabels]
331 combined_only=[t for t in combined.taxlabels if t not in both]
332 m_only=[t for t in m.taxlabels if t not in both]
333 for t in both:
334
335 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
336
337 for t in combined_only:
338 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
339 for t in m_only:
340 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
341 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
342 combined.taxlabels.extend(m_only)
343 for cn,cs in m.charsets.items():
344 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
345 if m.taxsets:
346 if not combined.taxsets:
347 combined.taxsets={}
348 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()]))
349 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
350
351 if m.charlabels:
352 if not combined.charlabels:
353 combined.charlabels={}
354 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
355 combined.nchar+=m.nchar
356 combined.ntax+=len(m_only)
357
358
359
360 for c in combined.charpartitions['combined']:
361 combined.charsets[c]=combined.charpartitions['combined'][c]
362
363 return combined
364
423
424
426 """Adjust linebreaks to match ';', strip leading/trailing whitespace
427
428 list_of_commandlines=_adjust_lines(input_text)
429 Lines are adjusted so that no linebreaks occur within a commandline
430 (except matrix command line)
431 """
432 formatted_lines=[]
433 for l in lines:
434
435 l=l.replace('\r\n','\n').replace('\r','\n').strip()
436 if l.lower().startswith('matrix'):
437 formatted_lines.append(l)
438 else:
439 l=l.replace('\n',' ')
440 if l:
441 formatted_lines.append(l)
442 return formatted_lines
443
445 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
446
447 opening=seq.find('(')
448 while opening>-1:
449 closing=seq.find(')')
450 if closing<0:
451 raise NexusError, 'Missing closing parenthesis in: '+seq
452 elif closing<opening:
453 raise NexusError, 'Missing opening parenthesis in: '+seq
454 ambig=[x for x in seq[opening+1:closing]]
455 ambig.sort()
456 ambig=''.join(ambig)
457 ambig_code=rev_ambig_values[ambig.upper()]
458 if ambig!=ambig.upper():
459 ambig_code=ambig_code.lower()
460 seq=seq[:opening]+ambig_code+seq[closing+1:]
461 opening=seq.find('(')
462 return seq
463
465 """Represent a commandline as command and options."""
466
468 self.options={}
469 options=[]
470 self.command=None
471 try:
472
473 self.command, options = line.strip().split('\n', 1)
474 except ValueError:
475
476 self.command=line.split()[0]
477 options=' '.join(line.split()[1:])
478 self.command = self.command.strip().lower()
479 if self.command in SPECIAL_COMMANDS:
480 self.options=options.strip()
481 else:
482 if len(options) > 0:
483 try:
484 options = options.replace('=', ' = ').split()
485 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
486 indices = []
487 for sl in valued_indices:
488 indices.extend(sl)
489 token_indices = [n for n in range(len(options)) if n not in indices]
490 for opt in valued_indices:
491
492 self.options[options[opt[0]].lower()] = options[opt[2]]
493 for token in token_indices:
494 self.options[options[token].lower()] = None
495 except ValueError:
496 raise NexusError, 'Incorrect formatting in line: %s' % line
497
499 """Represent a NEXUS block with block name and list of commandlines ."""
503
505
506 __slots__=['original_taxon_order','__dict__']
507
509 self.ntax=0
510 self.nchar=0
511 self.unaltered_taxlabels=[]
512 self.taxlabels=[]
513 self.charlabels=None
514 self.statelabels=None
515 self.datatype='dna'
516 self.respectcase=False
517 self.missing='?'
518 self.gap='-'
519 self.symbols=None
520 self.equate=None
521 self.matchchar=None
522 self.labels=None
523 self.transpose=False
524 self.interleave=False
525 self.tokens=False
526 self.eliminate=None
527 self.matrix=None
528 self.unknown_blocks=[]
529 self.taxsets={}
530 self.charsets={}
531 self.charpartitions={}
532 self.taxpartitions={}
533 self.trees=[]
534 self.translate=None
535 self.structured=[]
536 self.set={}
537 self.options={}
538 self.codonposset=None
539
540
541 self.options['gapmode']='missing'
542
543 if input:
544 self.read(input)
545 else:
546 self.read(DEFAULTNEXUS)
547
549 """Included for backwards compatibility."""
550 return self.taxlabels
552 """Included for backwards compatibility."""
553 self.taxlabels=value
554 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
555
556 - def read(self,input):
557 """Read and parse NEXUS imput (filename, file-handle, string."""
558
559
560
561 try:
562 file_contents = open(os.path.expanduser(input),'rU').read()
563 self.filename=input
564 except (TypeError,IOError,AttributeError):
565
566 if isinstance(input, str):
567 file_contents = input
568 self.filename='input_string'
569
570 elif hasattr(input,'read'):
571 file_contents=input.read()
572 if hasattr(input,"name") and input.name:
573 self.filename=input.name
574 else:
575 self.filename='Unknown_nexus_file'
576 else:
577 print input.strip()[:50]
578 raise NexusError, 'Unrecognized input: %s ...' % input[:100]
579 file_contents=file_contents.strip()
580 if file_contents.startswith('#NEXUS'):
581 file_contents=file_contents[6:]
582 if C:
583 decommented=cnexus.scanfile(file_contents)
584
585 if decommented=='[' or decommented==']':
586 raise NexusError, 'Unmatched %s' % decommented
587
588
589 commandlines=_adjust_lines(decommented.split(chr(7)))
590 else:
591 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
592
593 try:
594 if commandlines[0][:6].upper()=='#NEXUS':
595 commandlines[0]=commandlines[0][6:].strip()
596 except:
597 pass
598
599 nexus_block_gen = self._get_nexus_block(commandlines)
600 while 1:
601 try:
602 title, contents = nexus_block_gen.next()
603 except StopIteration:
604 break
605 if title in KNOWN_NEXUS_BLOCKS:
606 self._parse_nexus_block(title, contents)
607 else:
608 self._unknown_nexus_block(title, contents)
609
611 """Generator for looping through Nexus blocks."""
612 inblock=False
613 blocklines=[]
614 while file_contents:
615 cl=file_contents.pop(0)
616 if cl.lower().startswith('begin'):
617 if not inblock:
618 inblock=True
619 title=cl.split()[1].lower()
620 else:
621 raise NexusError('Illegal block nesting in block %s' % title)
622 elif cl.lower().startswith('end'):
623 if inblock:
624 inblock=False
625 yield title,blocklines
626 blocklines=[]
627 else:
628 raise NexusError('Unmatched \'end\'.')
629 elif inblock:
630 blocklines.append(cl)
631
637
639 """Parse a known Nexus Block """
640
641 self._apply_block_structure(title, contents)
642
643
644 block=self.structured[-1]
645 for line in block.commandlines:
646 try:
647 getattr(self,'_'+line.command)(line.options)
648 except AttributeError:
649 raise
650 raise NexusError, 'Unknown command: %s ' % line.command
651
654
656 if options.has_key('ntax'):
657 self.ntax=eval(options['ntax'])
658 if options.has_key('nchar'):
659 self.nchar=eval(options['nchar'])
660
740
741
742 - def _set(self,options):
744
746 self.options=options;
747
749 self.eliminate=options
750
752 """Get taxon labels. As the taxon names are already in the matrix, this is superfluous except for transpose matrices,
753 which are currently unsupported anyway.
754 Thus, we ignore the taxlabels command to make handling of duplicate taxon names easier.
755 """
756 pass
757
758
759
760
761
762
763
764
766 """Check for presence of taxon in self.taxlabels."""
767
768
769 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
770 nexid=taxon.replace(' ','_')
771 return nextaxa.get(nexid)
772
795
799
804
806 if not self.ntax or not self.nchar:
807 raise NexusError,'Dimensions must be specified before matrix!'
808 self.matrix={}
809 taxcount=0
810 first_matrix_block=True
811
812
813 lines=[l.strip() for l in options.split('\n') if l.strip()<>'']
814 lineiter=iter(lines)
815 while 1:
816 try:
817 l=lineiter.next()
818 except StopIteration:
819 if taxcount<self.ntax:
820 raise NexusError, 'Not enough taxa in matrix.'
821 elif taxcount>self.ntax:
822 raise NexusError, 'Too many taxa in matrix.'
823 else:
824 break
825
826 taxcount+=1
827
828 if taxcount>self.ntax:
829 if not self.interleave:
830 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?'
831 else:
832 taxcount=1
833 first_matrix_block=False
834
835 linechars=CharBuffer(l)
836 id=quotestrip(linechars.next_word())
837 l=linechars.rest().strip()
838 chars=''
839 if self.interleave:
840
841
842 if l:
843 chars=''.join(l.split())
844 else:
845 chars=''.join(lineiter.next().split())
846 else:
847
848 chars=''.join(l.split())
849 while len(chars)<self.nchar:
850 l=lineiter.next()
851 chars+=''.join(l.split())
852 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
853
854 if taxcount==1:
855 refseq=iupac_seq
856 else:
857 if self.matchchar:
858 while 1:
859 p=iupac_seq.tostring().find(self.matchchar)
860 if p==-1:
861 break
862 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
863
864 for i,c in enumerate(iupac_seq.tostring()):
865 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
866 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
867 % (id,c,l[i-10:i+10])
868
869 if first_matrix_block:
870 self.unaltered_taxlabels.append(id)
871 id=_unique_label(self.matrix.keys(),id)
872 self.matrix[id]=iupac_seq
873 self.taxlabels.append(id)
874 else:
875
876 id=_unique_label(self.taxlabels[:taxcount-1],id)
877 taxon_present=self._check_taxlabels(id)
878 if taxon_present:
879 self.matrix[taxon_present]+=iupac_seq
880 else:
881 raise NexusError, 'Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id
882
883 for taxon in self.matrix:
884 if len(self.matrix[taxon])!=self.nchar:
885 raise NexusError,'Matrx Nchar %d does not match data length (%d) for taxon %s' % (self.nchar, len(self.matrix[taxon]),taxon)
886
887 matrixkeys=self.matrix.keys()
888 matrixkeys.sort()
889 taxlabelssort=self.taxlabels[:]
890 taxlabelssort.sort()
891 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
892
912
914 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
915 self._tree(options)
916
917 - def _tree(self,options):
947
954
958
962
964 taxpartition={}
965 quotelevel=False
966 opts=CharBuffer(options)
967 name=self._name_n_vector(opts)
968 if not name:
969 raise NexusError, 'Formatting error in taxpartition: %s ' % options
970
971
972
973 sub=''
974 while True:
975 w=opts.next()
976 if w is None or (w==',' and not quotelevel):
977 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
978 taxpartition[subname]=_make_unique(subindices)
979 sub=''
980 if w is None:
981 break
982 else:
983 if w=="'":
984 quotelevel=not quotelevel
985 sub+=w
986 self.taxpartitions[name]=taxpartition
987
989 """Read codon positions from a codons block as written from McClade.
990 Here codonposset is just a fancy name for a character partition with the name CodonPositions and the partitions N,1,2,3
991 """
992
993 prev_partitions=self.charpartitions.keys()
994 self._charpartition(options)
995
996 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions]
997 if codonname==[] or len(codonname)>1:
998 raise NexusError, 'Formatting Error in codonposset: %s ' % options
999 else:
1000 self.codonposset=codonname[0]
1001
1004
1006 charpartition={}
1007 quotelevel=False
1008 opts=CharBuffer(options)
1009 name=self._name_n_vector(opts)
1010 if not name:
1011 raise NexusError, 'Formatting error in charpartition: %s ' % options
1012
1013
1014 sub=''
1015 while True:
1016 w=opts.next()
1017 if w is None or (w==',' and not quotelevel):
1018 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1019 charpartition[subname]=_make_unique(subindices)
1020 sub=''
1021 if w is None:
1022 break
1023 else:
1024 if w=="'":
1025 quotelevel=not quotelevel
1026 sub+=w
1027 self.charpartitions[name]=charpartition
1028
1030 """Parse the taxset/charset specification
1031 '1 2 3 - 5 dog cat 10 - 20 \\ 3' --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1032 """
1033 opts=CharBuffer(options)
1034 name=self._name_n_vector(opts,separator=separator)
1035 indices=self._parse_list(opts,set_type=set_type)
1036 if indices is None:
1037 raise NexusError, 'Formatting error in line: %s ' % options
1038 return name,indices
1039
1061
1063 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1064 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1065 """
1066 plain_list=[]
1067 if options_buffer.peek_nonwhitespace():
1068 try:
1069 while True:
1070 identifier=options_buffer.next_word()
1071 if not identifier:
1072 break
1073 start=self._resolve(identifier,set_type=set_type)
1074 if options_buffer.peek_nonwhitespace()=='-':
1075 end=start
1076 step=1
1077
1078 hyphen=options_buffer.next_nonwhitespace()
1079 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1080 if set_type==CHARSET:
1081 if options_buffer.peek_nonwhitespace()=='\\':
1082 backslash=options_buffer.next_nonwhitespace()
1083 step=int(options_buffer.next_word())
1084 plain_list.extend(range(start,end+1,step))
1085 else:
1086 if type(start)==list or type(end)==list:
1087 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
1088 start=self.taxlabels.index(start)
1089 end=self.taxlabels.index(end)
1090 taxrange=self.taxlabels[start:end+1]
1091 plain_list.extend(taxrange)
1092 else:
1093 if type(start)==list:
1094 plain_list.extend(start)
1095 else:
1096 plain_list.append(start)
1097 except NexusError:
1098 raise
1099 except:
1100 return None
1101 return plain_list
1102
1103 - def _resolve(self,identifier,set_type=None):
1104 """Translate identifier in list into character/taxon index.
1105 Characters (which are referred to by their index in Nexus.py):
1106 Plain numbers are returned minus 1 (Nexus indices to python indices)
1107 Text identifiers are translaterd into their indices (if plain character indentifiers),
1108 the first hit in charlabels is returned (charlabels don't need to be unique)
1109 or the range of indices is returned (if names of character sets).
1110 Taxa (which are referred to by their unique name in Nexus.py):
1111 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1112 Names are returned unchanged (if plain taxon identifiers), or the names in
1113 the corresponding taxon set is returned
1114 """
1115 identifier=quotestrip(identifier)
1116 if not set_type:
1117 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1118 if set_type==CHARSET:
1119 try:
1120 n=int(identifier)
1121 except ValueError:
1122 if self.charlabels and identifier in self.charlabels.values():
1123 for k in self.charlabels:
1124 if self.charlabels[k]==identifier:
1125 return k
1126 elif self.charsets and identifier in self.charsets:
1127 return self.charsets[identifier]
1128 else:
1129 raise NexusError, 'Unknown character identifier: %s' % identifier
1130 else:
1131 if n<=self.nchar:
1132 return n-1
1133 else:
1134 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar)
1135 elif set_type==TAXSET:
1136 try:
1137 n=int(identifier)
1138 except ValueError:
1139 taxlabels_id=self._check_taxlabels(identifier)
1140 if taxlabels_id:
1141 return taxlabels_id
1142 elif self.taxsets and identifier in self.taxsets:
1143 return self.taxsets[identifier]
1144 else:
1145 raise NexusError, 'Unknown taxon identifier: %s' % identifier
1146 else:
1147 if n>0 and n<=self.ntax:
1148 return self.taxlabels[n-1]
1149 else:
1150 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax)
1151 else:
1152 raise NexusError('Unknown set specification: %s.'% set_type)
1153
1157
1161
1165
1169
1172 """Writes a nexus file for each partition in charpartition.
1173 Only non-excluded characters and non-deleted taxa are included, just the data block is written.
1174 """
1175
1176 if not matrix:
1177 matrix=self.matrix
1178 if not matrix:
1179 return
1180 if not filename:
1181 filename=self.filename
1182 if charpartition:
1183 pfilenames={}
1184 for p in charpartition:
1185 total_exclude=[]+exclude
1186 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1187 total_exclude=_make_unique(total_exclude)
1188 pcomment=comment+'\nPartition: '+p+'\n'
1189 dot=filename.rfind('.')
1190 if dot>0:
1191 pfilename=filename[:dot]+'_'+p+'.data'
1192 else:
1193 pfilename=filename+'_'+p
1194 pfilenames[p]=pfilename
1195 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1196 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1197 mrbayes=mrbayes)
1198 return pfilenames
1199 else:
1200 fn=self.filename+'.data'
1201 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1202 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1203 mrbayes=mrbayes)
1204 return fn
1205
1206 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1207 blocksize=None, interleave=False, interleave_by_partition=False,\
1208 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1209 codons_block=True):
1210 """Writes a nexus file with data and sets block to a file or handle.
1211
1212 Character sets and partitions are appended by default, and are
1213 adjusted according to excluded characters (i.e. character sets
1214 still point to the same sites (not necessarily same positions),
1215 without including the deleted characters.
1216
1217 filename - Either a filename as a string (which will be opened,
1218 written to and closed), or a handle object (which will
1219 be written to but NOT closed).
1220 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1221 start of the file is ommited.
1222
1223 Returns the filename/handle used to write the data.
1224 """
1225 if not matrix:
1226 matrix=self.matrix
1227 if not matrix:
1228 return
1229 if not filename:
1230 filename=self.filename
1231 if [t for t in delete if not self._check_taxlabels(t)]:
1232 raise NexusError, 'Unknown taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels)))
1233 if interleave_by_partition:
1234 if not interleave_by_partition in self.charpartitions:
1235 raise NexusError, 'Unknown partition: '+interleave_by_partition
1236 else:
1237 partition=self.charpartitions[interleave_by_partition]
1238
1239 names=_sort_keys_by_values(partition)
1240 newpartition={}
1241 for p in partition:
1242 newpartition[p]=[c for c in partition[p] if c not in exclude]
1243
1244 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1245 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1246 ntax_adjusted=len(undelete)
1247 nchar_adjusted=len(cropped_matrix[undelete[0]])
1248 if not undelete or (undelete and undelete[0]==''):
1249 return
1250 if isinstance(filename,str):
1251 try:
1252 fh=open(filename,'w')
1253 except IOError:
1254 raise NexusError, 'Could not open %s for writing.' % filename
1255 elif hasattr(file, "write"):
1256 fh=filename
1257 else :
1258 raise ValueError, "Neither a filename nor a handle was supplied"
1259 if not omit_NEXUS:
1260 fh.write('#NEXUS\n')
1261 if comment:
1262 fh.write('['+comment+']\n')
1263 fh.write('begin data;\n')
1264 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1265 fh.write('\tformat datatype='+self.datatype)
1266 if self.respectcase:
1267 fh.write(' respectcase')
1268 if self.missing:
1269 fh.write(' missing='+self.missing)
1270 if self.gap:
1271 fh.write(' gap='+self.gap)
1272 if self.matchchar:
1273 fh.write(' matchchar='+self.matchchar)
1274 if self.labels:
1275 fh.write(' labels='+self.labels)
1276 if self.equate:
1277 fh.write(' equate='+self.equate)
1278 if interleave or interleave_by_partition:
1279 fh.write(' interleave')
1280 fh.write(';\n')
1281
1282
1283 if self.charlabels:
1284 newcharlabels=self._adjust_charlabels(exclude=exclude)
1285 clkeys=newcharlabels.keys()
1286 clkeys.sort()
1287 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1288 fh.write('matrix\n')
1289 if not blocksize:
1290 if interleave:
1291 blocksize=70
1292 else:
1293 blocksize=self.nchar
1294
1295 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1296 if interleave_by_partition:
1297
1298 seek=0
1299 for p in names:
1300 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1301 if len(newpartition[p])>0:
1302 for taxon in undelete:
1303 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1304 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1305 fh.write('\n')
1306 else:
1307 fh.write('[empty]\n\n')
1308 seek+=len(newpartition[p])
1309 elif interleave:
1310 for seek in range(0,nchar_adjusted,blocksize):
1311 for taxon in undelete:
1312 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1313 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1314 fh.write('\n')
1315 else:
1316 for taxon in undelete:
1317 if blocksize<nchar_adjusted:
1318 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1319 else:
1320 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1321 for seek in range(0,nchar_adjusted,blocksize):
1322 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1323 fh.write(';\nend;\n')
1324 if append_sets:
1325 if codons_block:
1326 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1327 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1328 else:
1329 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1330
1331 if fh == filename :
1332
1333 return filename
1334 else :
1335
1336 fh.close()
1337 return filename
1338
1340 """Returns a sets block"""
1341 if not self.charsets and not self.taxsets and not self.charpartitions:
1342 return ''
1343 if codons_only:
1344 sets=['\nbegin codons']
1345 else:
1346 sets=['\nbegin sets']
1347
1348
1349
1350
1351 offset=0
1352 offlist=[]
1353 for c in range(self.nchar):
1354 if c in exclude:
1355 offset+=1
1356 offlist.append(-1)
1357 else:
1358 offlist.append(c-offset)
1359
1360 if not codons_only:
1361 for n,ns in self.charsets.items():
1362 cset=[offlist[c] for c in ns if c not in exclude]
1363 if cset:
1364 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1365 for n,s in self.taxsets.items():
1366 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1367 if tset:
1368 sets.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1369 for n,p in self.charpartitions.items():
1370 if not include_codons and n==CODONPOSITIONS:
1371 continue
1372 elif codons_only and n!=CODONPOSITIONS:
1373 continue
1374
1375
1376
1377 names=_sort_keys_by_values(p)
1378 newpartition={}
1379 for sn in names:
1380 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1381 if nsp:
1382 newpartition[sn]=nsp
1383 if newpartition:
1384 if include_codons and n==CODONPOSITIONS:
1385 command='codonposset'
1386 else:
1387 command='charpartition'
1388 sets.append('%s %s = %s' % (command,safename(n),\
1389 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1390
1391 for n,p in self.taxpartitions.items():
1392 names=_sort_keys_by_values(p)
1393 newpartition={}
1394 for sn in names:
1395 nsp=[t for t in p[sn] if t not in delete]
1396 if nsp:
1397 newpartition[sn]=nsp
1398 if newpartition:
1399 sets.append('taxpartition %s = %s' % (safename(n),\
1400 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1401
1402 sets.append('end;\n')
1403 if len(sets)==2:
1404 return ''
1405 else:
1406 return ';\n'.join(sets)
1407
1409 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1410 if not filename:
1411 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1412 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1413 else:
1414 filename=self.filename+'.fas'
1415 fh=open(filename,'w')
1416 for taxon in self.taxlabels:
1417 fh.write('>'+safename(taxon)+'\n')
1418 for i in range(0, len(self.matrix[taxon].tostring()), width):
1419 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1420 fh.close()
1421
1423 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1424 if not filename:
1425 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1426 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1427 else:
1428 filename=self.filename+'.phy'
1429 fh=open(filename,'w')
1430 fh.write('%d %d\n' % (self.ntax,self.nchar))
1431 for taxon in self.taxlabels:
1432 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1433 fh.close()
1434
1435 - def constant(self,matrix=None,delete=[],exclude=[]):
1436 """Return a list with all constant characters."""
1437 if not matrix:
1438 matrix=self.matrix
1439 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1440 if not undelete:
1441 return None
1442 elif len(undelete)==1:
1443 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1444
1445 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1446 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1447 for taxon in undelete[1:]:
1448 newconstant=[]
1449 for site in constant:
1450
1451 seqsite=matrix[taxon][site[0]].upper()
1452
1453 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1454
1455 newconstant.append(site)
1456 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1457
1458 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1459 elif seqsite in self.ambiguous_values:
1460 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1]))
1461 if intersect:
1462 newconstant.append((site[0],''.join(intersect)))
1463
1464
1465
1466
1467
1468 constant=newconstant
1469 cpos=[s[0] for s in constant]
1470 return constant
1471
1472
1474 """Summarize character.
1475 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1476 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1477 """
1478 undelete=[t for t in self.taxlabels if t not in delete]
1479 if not undelete:
1480 return None
1481 cstatus=[]
1482 for t in undelete:
1483 c=self.matrix[t][site].upper()
1484 if self.options.get('gapmode')=='missing' and c==self.gap:
1485 c=self.missing
1486 if narrow and c==self.missing:
1487 if c not in cstatus:
1488 cstatus.append(c)
1489 else:
1490 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1491 if self.missing in cstatus and narrow and len(cstatus)>1:
1492 cstatus=[c for c in cstatus if c!=self.missing]
1493 cstatus.sort()
1494 return cstatus
1495
1497 """Calculates a stepmatrix for weighted parsimony.
1498 See Wheeler (1990), Cladistics 6:269-275 and
1499 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1500 """
1501 m=StepMatrix(self.unambiguous_letters,self.gap)
1502 for site in [s for s in range(self.nchar) if s not in exclude]:
1503 cstatus=self.cstatus(site,delete)
1504 for i,b1 in enumerate(cstatus[:-1]):
1505 for b2 in cstatus[i+1:]:
1506 m.add(b1.upper(),b2.upper(),1)
1507 return m.transformation().weighting().smprint(name=name)
1508
1509 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1510 """Return a matrix without deleted taxa and excluded characters."""
1511 if not matrix:
1512 matrix=self.matrix
1513 if [t for t in delete if not self._check_taxlabels(t)]:
1514 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels))
1515 if exclude!=[]:
1516 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1517 if not undelete:
1518 return {}
1519 m=[matrix[k].tostring() for k in undelete]
1520 zipped_m=zip(*m)
1521 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1522 if sitesm==[]:
1523 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1524 else:
1525 zipped_sitesm=zip(*sitesm)
1526 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1527 return dict(zip(undelete,m))
1528 else:
1529 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1530
1531 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1532 """Return a bootstrapped matrix."""
1533 if not matrix:
1534 matrix=self.matrix
1535 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1536 cm=self.crop_matrix(delete=delete,exclude=exclude)
1537 if not cm:
1538 return {}
1539 elif len(cm[cm.keys()[0]])==0:
1540 return cm
1541 undelete=[t for t in self.taxlabels if t in cm]
1542 if seqobjects:
1543 sitesm=zip(*[cm[t].tostring() for t in undelete])
1544 alphabet=matrix[matrix.keys()[0]].alphabet
1545 else:
1546 sitesm=zip(*[cm[t] for t in undelete])
1547 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1548 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1549 if seqobjects:
1550 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1551 return dict(zip(undelete,bootstrapseqs))
1552
1554 """Adds a sequence to the matrix."""
1555
1556 if not name:
1557 raise NexusError, 'New sequence must have a name'
1558
1559 diff=self.nchar-len(sequence)
1560 if diff<0:
1561 self.insert_gap(self.nchar,-diff)
1562 elif diff>0:
1563 sequence+=self.missing*diff
1564
1565 if name in self.taxlabels:
1566 unique_name=_unique_label(self.taxlabels,name)
1567 print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name)
1568 else:
1569 unique_name=name
1570
1571 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1572
1573 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1574 self.ntax+=1
1575 self.taxlabels.append(unique_name)
1576 self.unaltered_taxlabels.append(name)
1577
1579 """Add a gap into the matrix and adjust charsets and partitions.
1580
1581 pos=0: first position
1582 pos=nchar: last position
1583 """
1584
1585 def _adjust(set,x,d,leftgreedy=False):
1586 """Adjusts chartacter sets if gaps are inserted, taking care of
1587 new gaps within a coherent character set."""
1588
1589
1590
1591 set.sort()
1592 addpos=0
1593 for i,c in enumerate(set):
1594 if c>=x:
1595 set[i]=c+d
1596
1597 if c==x:
1598 if leftgreedy or (i>0 and set[i-1]==c-1):
1599 addpos=i
1600 if addpos>0:
1601 set[addpos:addpos]=range(x,x+d)
1602 return set
1603
1604 if pos<0 or pos>self.nchar:
1605 raise NexusError('Illegal gap position: %d' % pos)
1606 if n==0:
1607 return
1608 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1609 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1610
1611
1612 zipped=zip(*sitesm)
1613 mapped=map(''.join,zipped)
1614 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1615 self.matrix=dict(listed)
1616 self.nchar+=n
1617
1618 for i,s in self.charsets.items():
1619 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1620 for p in self.charpartitions:
1621 for sp,s in self.charpartitions[p].items():
1622 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1623
1624 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1625 return self.charlabels
1626
1628 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1629 if exclude and insert:
1630 raise NexusError, 'Can\'t exclude and insert at the same time'
1631 if not self.charlabels:
1632 return None
1633 labels=self.charlabels.keys()
1634 labels.sort()
1635 newcharlabels={}
1636 if exclude:
1637 exclude.sort()
1638 exclude.append(sys.maxint)
1639 excount=0
1640 for c in labels:
1641 if not c in exclude:
1642 while c>exclude[excount]:
1643 excount+=1
1644 newcharlabels[c-excount]=self.charlabels[c]
1645 elif insert:
1646 insert.sort()
1647 insert.append(sys.maxint)
1648 icount=0
1649 for c in labels:
1650 while c>=insert[icount]:
1651 icount+=1
1652 newcharlabels[c+icount]=self.charlabels[c]
1653 else:
1654 return self.charlabels
1655 return newcharlabels
1656
1658 """Returns all character indices that are not in charlist."""
1659 return [c for c in range(self.nchar) if c not in charlist]
1660
1662 """Return gap-only sites."""
1663 gap=sets.Set(self.gap)
1664 if include_missing:
1665 gap.add(self.missing)
1666 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1667 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)]
1668 return gaponly
1669
1691