1
2
3
4
5
6
7
8
9
10
11 """Nexus class. Parse the contents of a nexus file.
12 Based upon 'NEXUS: An extensible file format for systematic information'
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
14 """
15
16 import os,sys, math, random, copy
17
18 from Bio.Alphabet import IUPAC
19 from Bio.Data import IUPACData
20 from Bio.Seq import Seq
21
22 from Trees import Tree,NodeData
23
24 try:
25 import cnexus
26 except ImportError:
27 C=False
28 else:
29 C=True
30
31 try:
32
33 set = set
34 except NameError:
35
36 from sets import Set as set
37
38 INTERLEAVE=70
39 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
40 'matrix','tree', 'utree','translate','codonposset','title']
41 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
42 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
43 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
44 WHITESPACE=' \t\n'
45
46 SPECIALCOMMENTS=['&']
47 CHARSET='chars'
48 TAXSET='taxa'
49 CODONPOSITIONS='codonpositions'
50 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
52
54 """Helps reading NEXUS-words and characters from a buffer."""
56 if string:
57 self.buffer=list(string)
58 else:
59 self.buffer=[]
60
62 if self.buffer:
63 return self.buffer[0]
64 else:
65 return None
66
68 b=''.join(self.buffer).strip()
69 if b:
70 return b[0]
71 else:
72 return None
73
75 if self.buffer:
76 return self.buffer.pop(0)
77 else:
78 return None
79
81 while True:
82 p=self.next()
83 if p is None:
84 break
85 if p not in WHITESPACE:
86 return p
87 return None
88
90 while self.buffer[0] in WHITESPACE:
91 self.buffer=self.buffer[1:]
92
94 for t in target:
95 try:
96 pos=self.buffer.index(t)
97 except ValueError:
98 pass
99 else:
100 found=''.join(self.buffer[:pos])
101 self.buffer=self.buffer[pos:]
102 return found
103 else:
104 return None
105
107 return ''.join(self.buffer[:len(word)])==word
108
110 """Return the next NEXUS word from a string.
111
112 This deals with single and double quotes, whitespace and punctuation.
113 """
114
115 word=[]
116 quoted=False
117 first=self.next_nonwhitespace()
118 if not first:
119 return None
120 word.append(first)
121 if first=="'":
122 quoted=True
123 elif first in PUNCTUATION:
124 return first
125 while True:
126 c=self.peek()
127 if c=="'":
128 word.append(self.next())
129 if self.peek()=="'":
130 skip=self.next()
131 elif quoted:
132 break
133 elif quoted:
134 word.append(self.next())
135 elif not c or c in PUNCTUATION or c in WHITESPACE:
136 break
137 else:
138 word.append(self.next())
139 return ''.join(word)
140
142 """Return the rest of the string without parsing."""
143 return ''.join(self.buffer)
144
146 """Calculate a stepmatrix for weighted parsimony.
147
148 See Wheeler (1990), Cladistics 6:269-275.
149 """
150
152 self.data={}
153 self.symbols=[s for s in symbols]
154 self.symbols.sort()
155 if gap:
156 self.symbols.append(gap)
157 for x in self.symbols:
158 for y in [s for s in self.symbols if s!=x]:
159 self.set(x,y,0)
160
161 - def set(self,x,y,value):
165
166 - def add(self,x,y,value):
170
173
180
182 for k in self.data:
183 if self.data[k]!=0:
184 self.data[k]=-math.log(self.data[k])
185 return self
186
187 - def smprint(self,name='your_name_here'):
188 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
189 matrix+=' %s\n' % ' '.join(self.symbols)
190 for x in self.symbols:
191 matrix+='[%s]'.ljust(8) % x
192 for y in self.symbols:
193 if x==y:
194 matrix+=' . '
195 else:
196 if x>y:
197 x1,y1=y,x
198 else:
199 x1,y1=x,y
200 if self.data[x1+y1]==0:
201 matrix+='inf. '
202 else:
203 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
204 matrix+='\n'
205 matrix+=';\n'
206 return matrix
207
209 """Return a taxon identifier according to NEXUS standard.
210
211 Wrap quotes around names with punctuation or whitespace, and double
212 single quotes.
213
214 mrbayes=True: write names without quotes, whitespace or punctuation
215 for the mrbayes software package.
216 """
217 if mrbayes:
218 safe=name.replace(' ','_')
219 safe=''.join([c for c in safe if c in MRBAYESSAFE])
220 else:
221 safe=name.replace("'","''")
222 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
223 safe="'"+safe+"'"
224 return safe
225
227 """Remove quotes and/or double quotes around identifiers."""
228 if not word:
229 return None
230 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
231 word=word[1:-1]
232 return word
233
252
254 """Returns a sorted list of keys of p sorted by values of p."""
255 startpos=[(p[pn],pn) for pn in p if p[pn]]
256 startpos.sort()
257 return zip(*startpos)[1]
258
260 """Check that all values in list are unique and return a pruned and sorted list."""
261 l=list(set(l))
262 l.sort()
263 return l
264
266 """Returns a unique name if label is already in previous_labels."""
267 while label in previous_labels:
268 if label.split('.')[-1].startswith('copy'):
269 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
270 else:
271 label+='.copy'
272 return label
273
275 """Converts a Seq-object matrix to a plain sequence-string matrix."""
276 return dict([(t,matrix[t].tostring()) for t in matrix])
277
279 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
280 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
281 """
282
283 if not orig_list:
284 return ''
285 orig_list=list(set(orig_list))
286 orig_list.sort()
287 shortlist=[]
288 clist=orig_list[:]
289 clist.append(clist[-1]+.5)
290 while len(clist)>1:
291 step=1
292 for i,x in enumerate(clist):
293 if x==clist[0]+i*step:
294 continue
295 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
296
297
298 step=x-clist[0]
299 else:
300 sub=clist[:i]
301 if len(sub)==1:
302 shortlist.append(str(sub[0]+1))
303 else:
304 if step==1:
305 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
306 else:
307 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
308 clist=clist[i:]
309 break
310 return ' '.join(shortlist)
311
313 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
314
315 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
316 Character sets, character partitions and taxon sets are prefixed, readjusted
317 and present in the combined matrix.
318 """
319
320 if not matrices:
321 return None
322 name=matrices[0][0]
323 combined=copy.deepcopy(matrices[0][1])
324 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
325 if mixed_datatypes:
326 combined.datatype='None'
327
328 combined.charlabels=None
329 combined.statelabels=None
330 combined.interleave=False
331 combined.translate=None
332
333
334 for cn,cs in combined.charsets.items():
335 combined.charsets['%s.%s' % (name,cn)]=cs
336 del combined.charsets[cn]
337 for tn,ts in combined.taxsets.items():
338 combined.taxsets['%s.%s' % (name,tn)]=ts
339 del combined.taxsets[tn]
340
341
342 combined.charpartitions={'combined':{name:range(combined.nchar)}}
343 for n,m in matrices[1:]:
344 both=[t for t in combined.taxlabels if t in m.taxlabels]
345 combined_only=[t for t in combined.taxlabels if t not in both]
346 m_only=[t for t in m.taxlabels if t not in both]
347 for t in both:
348
349 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
350
351 for t in combined_only:
352 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
353 for t in m_only:
354 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
355 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
356 combined.taxlabels.extend(m_only)
357 for cn,cs in m.charsets.items():
358 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
359 if m.taxsets:
360 if not combined.taxsets:
361 combined.taxsets={}
362 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()]))
363 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
364
365 if m.charlabels:
366 if not combined.charlabels:
367 combined.charlabels={}
368 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
369 combined.nchar+=m.nchar
370 combined.ntax+=len(m_only)
371
372
373
374 for c in combined.charpartitions['combined']:
375 combined.charsets[c]=combined.charpartitions['combined'][c]
376
377 return combined
378
437
438
440 """Adjust linebreaks to match ';', strip leading/trailing whitespace.
441
442 list_of_commandlines=_adjust_lines(input_text)
443 Lines are adjusted so that no linebreaks occur within a commandline
444 (except matrix command line)
445 """
446 formatted_lines=[]
447 for l in lines:
448
449 l=l.replace('\r\n','\n').replace('\r','\n').strip()
450 if l.lower().startswith('matrix'):
451 formatted_lines.append(l)
452 else:
453 l=l.replace('\n',' ')
454 if l:
455 formatted_lines.append(l)
456 return formatted_lines
457
459 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
460
461 opening=seq.find('(')
462 while opening>-1:
463 closing=seq.find(')')
464 if closing<0:
465 raise NexusError('Missing closing parenthesis in: '+seq)
466 elif closing<opening:
467 raise NexusError('Missing opening parenthesis in: '+seq)
468 ambig=[x for x in seq[opening+1:closing]]
469 ambig.sort()
470 ambig=''.join(ambig)
471 ambig_code=rev_ambig_values[ambig.upper()]
472 if ambig!=ambig.upper():
473 ambig_code=ambig_code.lower()
474 seq=seq[:opening]+ambig_code+seq[closing+1:]
475 opening=seq.find('(')
476 return seq
477
479 """Represent a commandline as command and options."""
480
482 self.options={}
483 options=[]
484 self.command=None
485 try:
486
487 self.command, options = line.strip().split('\n', 1)
488 except ValueError:
489
490 self.command=line.split()[0]
491 options=' '.join(line.split()[1:])
492 self.command = self.command.strip().lower()
493 if self.command in SPECIAL_COMMANDS:
494 self.options=options.strip()
495 else:
496 if len(options) > 0:
497 try:
498 options = options.replace('=', ' = ').split()
499 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
500 indices = []
501 for sl in valued_indices:
502 indices.extend(sl)
503 token_indices = [n for n in range(len(options)) if n not in indices]
504 for opt in valued_indices:
505
506 self.options[options[opt[0]].lower()] = options[opt[2]]
507 for token in token_indices:
508 self.options[options[token].lower()] = None
509 except ValueError:
510 raise NexusError('Incorrect formatting in line: %s' % line)
511
513 """Represent a NEXUS block with block name and list of commandlines."""
517
519
520 __slots__=['original_taxon_order','__dict__']
521
523 self.ntax=0
524 self.nchar=0
525 self.unaltered_taxlabels=[]
526 self.taxlabels=[]
527 self.charlabels=None
528 self.statelabels=None
529 self.datatype='dna'
530 self.respectcase=False
531 self.missing='?'
532 self.gap='-'
533 self.symbols=None
534 self.equate=None
535 self.matchchar=None
536 self.labels=None
537 self.transpose=False
538 self.interleave=False
539 self.tokens=False
540 self.eliminate=None
541 self.matrix=None
542 self.unknown_blocks=[]
543 self.taxsets={}
544 self.charsets={}
545 self.charpartitions={}
546 self.taxpartitions={}
547 self.trees=[]
548 self.translate=None
549 self.structured=[]
550 self.set={}
551 self.options={}
552 self.codonposset=None
553
554
555 self.options['gapmode']='missing'
556
557 if input:
558 self.read(input)
559 else:
560 self.read(DEFAULTNEXUS)
561
563 """Included for backwards compatibility (DEPRECATED)."""
564 return self.taxlabels
566 """Included for backwards compatibility (DEPRECATED)."""
567 self.taxlabels=value
568 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
569
570 - def read(self,input):
571 """Read and parse NEXUS imput (a filename, file-handle, or string)."""
572
573
574
575 try:
576 file_contents = open(os.path.expanduser(input),'rU').read()
577 self.filename=input
578 except (TypeError,IOError,AttributeError):
579
580 if isinstance(input, str):
581 file_contents = input
582 self.filename='input_string'
583
584 elif hasattr(input,'read'):
585 file_contents=input.read()
586 if hasattr(input,"name") and input.name:
587 self.filename=input.name
588 else:
589 self.filename='Unknown_nexus_file'
590 else:
591 print input.strip()[:50]
592 raise NexusError('Unrecognized input: %s ...' % input[:100])
593 file_contents=file_contents.strip()
594 if file_contents.startswith('#NEXUS'):
595 file_contents=file_contents[6:]
596 if C:
597 decommented=cnexus.scanfile(file_contents)
598
599 if decommented=='[' or decommented==']':
600 raise NexusError('Unmatched %s' % decommented)
601
602
603 commandlines=_adjust_lines(decommented.split(chr(7)))
604 else:
605 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
606
607 for i,cl in enumerate(commandlines):
608 try:
609 if cl[:6].upper()=='#NEXUS':
610 commandlines[i]=cl[6:].strip()
611 except:
612 pass
613
614 nexus_block_gen = self._get_nexus_block(commandlines)
615 while 1:
616 try:
617 title, contents = nexus_block_gen.next()
618 except StopIteration:
619 break
620 if title in KNOWN_NEXUS_BLOCKS:
621 self._parse_nexus_block(title, contents)
622 else:
623 self._unknown_nexus_block(title, contents)
624
626 """Generator for looping through Nexus blocks."""
627 inblock=False
628 blocklines=[]
629 while file_contents:
630 cl=file_contents.pop(0)
631 if cl.lower().startswith('begin'):
632 if not inblock:
633 inblock=True
634 title=cl.split()[1].lower()
635 else:
636 raise NexusError('Illegal block nesting in block %s' % title)
637 elif cl.lower().startswith('end'):
638 if inblock:
639 inblock=False
640 yield title,blocklines
641 blocklines=[]
642 else:
643 raise NexusError('Unmatched \'end\'.')
644 elif inblock:
645 blocklines.append(cl)
646
652
654 """Parse a known Nexus Block (PRIVATE)."""
655
656 self._apply_block_structure(title, contents)
657
658
659 block=self.structured[-1]
660 for line in block.commandlines:
661 try:
662 getattr(self,'_'+line.command)(line.options)
663 except AttributeError:
664 raise
665 raise NexusError('Unknown command: %s ' % line.command)
666
669
671 if 'ntax' in options:
672 self.ntax=eval(options['ntax'])
673 if 'nchar' in options:
674 self.nchar=eval(options['nchar'])
675
755
756
757 - def _set(self,options):
759
761 self.options=options;
762
764 self.eliminate=options
765
767 """Get taxon labels (PRIVATE).
768
769 As the taxon names are already in the matrix, this is superfluous
770 except for transpose matrices, which are currently unsupported anyway.
771 Thus, we ignore the taxlabels command to make handling of duplicate
772 taxon names easier.
773 """
774 pass
775
776
777
778
779
780
781
782
784 """Check for presence of taxon in self.taxlabels."""
785
786
787 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
788 nexid=taxon.replace(' ','_')
789 return nextaxa.get(nexid)
790
813
817
822
824 if not self.ntax or not self.nchar:
825 raise NexusError('Dimensions must be specified before matrix!')
826 self.matrix={}
827 taxcount=0
828 first_matrix_block=True
829
830
831 lines=[l.strip() for l in options.split('\n') if l.strip()!='']
832 lineiter=iter(lines)
833 while 1:
834 try:
835 l=lineiter.next()
836 except StopIteration:
837 if taxcount<self.ntax:
838 raise NexusError('Not enough taxa in matrix.')
839 elif taxcount>self.ntax:
840 raise NexusError('Too many taxa in matrix.')
841 else:
842 break
843
844 taxcount+=1
845
846 if taxcount>self.ntax:
847 if not self.interleave:
848 raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
849 else:
850 taxcount=1
851 first_matrix_block=False
852
853 linechars=CharBuffer(l)
854 id=quotestrip(linechars.next_word())
855 l=linechars.rest().strip()
856 chars=''
857 if self.interleave:
858
859
860 if l:
861 chars=''.join(l.split())
862 else:
863 chars=''.join(lineiter.next().split())
864 else:
865
866 chars=''.join(l.split())
867 while len(chars)<self.nchar:
868 l=lineiter.next()
869 chars+=''.join(l.split())
870 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
871
872 if taxcount==1:
873 refseq=iupac_seq
874 else:
875 if self.matchchar:
876 while 1:
877 p=iupac_seq.tostring().find(self.matchchar)
878 if p==-1:
879 break
880 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
881
882 for i,c in enumerate(iupac_seq.tostring()):
883 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
884 raise NexusError('Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
885 % (id,c,l[i-10:i+10]))
886
887 if first_matrix_block:
888 self.unaltered_taxlabels.append(id)
889 id=_unique_label(self.matrix.keys(),id)
890 self.matrix[id]=iupac_seq
891 self.taxlabels.append(id)
892 else:
893
894 id=_unique_label(self.taxlabels[:taxcount-1],id)
895 taxon_present=self._check_taxlabels(id)
896 if taxon_present:
897 self.matrix[taxon_present]+=iupac_seq
898 else:
899 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
900
901 for taxon in self.matrix:
902 if len(self.matrix[taxon])!=self.nchar:
903 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
904 % (self.nchar, len(self.matrix[taxon]),taxon))
905
906 matrixkeys=self.matrix.keys()
907 matrixkeys.sort()
908 taxlabelssort=self.taxlabels[:]
909 taxlabelssort.sort()
910 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
911
931
933 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
934 self._tree(options)
935
936 - def _tree(self,options):
969
976
980
984
986 taxpartition={}
987 quotelevel=False
988 opts=CharBuffer(options)
989 name=self._name_n_vector(opts)
990 if not name:
991 raise NexusError('Formatting error in taxpartition: %s ' % options)
992
993
994
995 sub=''
996 while True:
997 w=opts.next()
998 if w is None or (w==',' and not quotelevel):
999 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
1000 taxpartition[subname]=_make_unique(subindices)
1001 sub=''
1002 if w is None:
1003 break
1004 else:
1005 if w=="'":
1006 quotelevel=not quotelevel
1007 sub+=w
1008 self.taxpartitions[name]=taxpartition
1009
1011 """Read codon positions from a codons block as written from McClade.
1012
1013 Here codonposset is just a fancy name for a character partition with
1014 the name CodonPositions and the partitions N,1,2,3
1015 """
1016
1017 prev_partitions=self.charpartitions.keys()
1018 self._charpartition(options)
1019
1020 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions]
1021 if codonname==[] or len(codonname)>1:
1022 raise NexusError('Formatting Error in codonposset: %s ' % options)
1023 else:
1024 self.codonposset=codonname[0]
1025
1028
1030 charpartition={}
1031 quotelevel=False
1032 opts=CharBuffer(options)
1033 name=self._name_n_vector(opts)
1034 if not name:
1035 raise NexusError('Formatting error in charpartition: %s ' % options)
1036
1037
1038 sub=''
1039 while True:
1040 w=opts.next()
1041 if w is None or (w==',' and not quotelevel):
1042 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1043 charpartition[subname]=_make_unique(subindices)
1044 sub=''
1045 if w is None:
1046 break
1047 else:
1048 if w=="'":
1049 quotelevel=not quotelevel
1050 sub+=w
1051 self.charpartitions[name]=charpartition
1052
1054 """Parse the taxset/charset specification (PRIVATE).
1055
1056 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
1057 --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1058 """
1059 opts=CharBuffer(options)
1060 name=self._name_n_vector(opts,separator=separator)
1061 indices=self._parse_list(opts,set_type=set_type)
1062 if indices is None:
1063 raise NexusError('Formatting error in line: %s ' % options)
1064 return name,indices
1065
1089
1091 """Parse a NEXUS list (PRIVATE).
1092
1093 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1094 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1095 """
1096 plain_list=[]
1097 if options_buffer.peek_nonwhitespace():
1098 try:
1099 while True:
1100 identifier=options_buffer.next_word()
1101 if not identifier:
1102 break
1103 start=self._resolve(identifier,set_type=set_type)
1104 if options_buffer.peek_nonwhitespace()=='-':
1105 end=start
1106 step=1
1107
1108 hyphen=options_buffer.next_nonwhitespace()
1109 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1110 if set_type==CHARSET:
1111 if options_buffer.peek_nonwhitespace()=='\\':
1112 backslash=options_buffer.next_nonwhitespace()
1113 step=int(options_buffer.next_word())
1114 plain_list.extend(range(start,end+1,step))
1115 else:
1116 if type(start)==list or type(end)==list:
1117 raise NexusError('Name if character sets not allowed in range definition: %s'\
1118 % identifier)
1119 start=self.taxlabels.index(start)
1120 end=self.taxlabels.index(end)
1121 taxrange=self.taxlabels[start:end+1]
1122 plain_list.extend(taxrange)
1123 else:
1124 if type(start)==list:
1125 plain_list.extend(start)
1126 else:
1127 plain_list.append(start)
1128 except NexusError:
1129 raise
1130 except:
1131 return None
1132 return plain_list
1133
1134 - def _resolve(self,identifier,set_type=None):
1135 """Translate identifier in list into character/taxon index.
1136
1137 Characters (which are referred to by their index in Nexus.py):
1138 Plain numbers are returned minus 1 (Nexus indices to python indices)
1139 Text identifiers are translaterd into their indices (if plain character indentifiers),
1140 the first hit in charlabels is returned (charlabels don't need to be unique)
1141 or the range of indices is returned (if names of character sets).
1142 Taxa (which are referred to by their unique name in Nexus.py):
1143 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1144 Names are returned unchanged (if plain taxon identifiers), or the names in
1145 the corresponding taxon set is returned
1146 """
1147 identifier=quotestrip(identifier)
1148 if not set_type:
1149 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1150 if set_type==CHARSET:
1151 try:
1152 n=int(identifier)
1153 except ValueError:
1154 if self.charlabels and identifier in self.charlabels.values():
1155 for k in self.charlabels:
1156 if self.charlabels[k]==identifier:
1157 return k
1158 elif self.charsets and identifier in self.charsets:
1159 return self.charsets[identifier]
1160 else:
1161 raise NexusError('Unknown character identifier: %s' \
1162 % identifier)
1163 else:
1164 if n<=self.nchar:
1165 return n-1
1166 else:
1167 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
1168 % (identifier,self.nchar))
1169 elif set_type==TAXSET:
1170 try:
1171 n=int(identifier)
1172 except ValueError:
1173 taxlabels_id=self._check_taxlabels(identifier)
1174 if taxlabels_id:
1175 return taxlabels_id
1176 elif self.taxsets and identifier in self.taxsets:
1177 return self.taxsets[identifier]
1178 else:
1179 raise NexusError('Unknown taxon identifier: %s' \
1180 % identifier)
1181 else:
1182 if n>0 and n<=self.ntax:
1183 return self.taxlabels[n-1]
1184 else:
1185 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
1186 % (identifier,self.ntax))
1187 else:
1188 raise NexusError('Unknown set specification: %s.'% set_type)
1189
1193
1197
1201
1205
1208 """Writes a nexus file for each partition in charpartition.
1209
1210 Only non-excluded characters and non-deleted taxa are included,
1211 just the data block is written.
1212 """
1213
1214 if not matrix:
1215 matrix=self.matrix
1216 if not matrix:
1217 return
1218 if not filename:
1219 filename=self.filename
1220 if charpartition:
1221 pfilenames={}
1222 for p in charpartition:
1223 total_exclude=[]+exclude
1224 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1225 total_exclude=_make_unique(total_exclude)
1226 pcomment=comment+'\nPartition: '+p+'\n'
1227 dot=filename.rfind('.')
1228 if dot>0:
1229 pfilename=filename[:dot]+'_'+p+'.data'
1230 else:
1231 pfilename=filename+'_'+p
1232 pfilenames[p]=pfilename
1233 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1234 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1235 mrbayes=mrbayes)
1236 return pfilenames
1237 else:
1238 fn=self.filename+'.data'
1239 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1240 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1241 mrbayes=mrbayes)
1242 return fn
1243
1244 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1245 blocksize=None, interleave=False, interleave_by_partition=False,\
1246 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1247 codons_block=True):
1248 """Writes a nexus file with data and sets block to a file or handle.
1249
1250 Character sets and partitions are appended by default, and are
1251 adjusted according to excluded characters (i.e. character sets
1252 still point to the same sites (not necessarily same positions),
1253 without including the deleted characters.
1254
1255 filename - Either a filename as a string (which will be opened,
1256 written to and closed), or a handle object (which will
1257 be written to but NOT closed).
1258 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1259 start of the file is ommited.
1260
1261 Returns the filename/handle used to write the data.
1262 """
1263 if not matrix:
1264 matrix=self.matrix
1265 if not matrix:
1266 return
1267 if not filename:
1268 filename=self.filename
1269 if [t for t in delete if not self._check_taxlabels(t)]:
1270 raise NexusError('Unknown taxa: %s' \
1271 % ', '.join(set(delete).difference(set(self.taxlabels))))
1272 if interleave_by_partition:
1273 if not interleave_by_partition in self.charpartitions:
1274 raise NexusError('Unknown partition: '+interleave_by_partition)
1275 else:
1276 partition=self.charpartitions[interleave_by_partition]
1277
1278 names=_sort_keys_by_values(partition)
1279 newpartition={}
1280 for p in partition:
1281 newpartition[p]=[c for c in partition[p] if c not in exclude]
1282
1283 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1284 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1285 ntax_adjusted=len(undelete)
1286 nchar_adjusted=len(cropped_matrix[undelete[0]])
1287 if not undelete or (undelete and undelete[0]==''):
1288 return
1289 if isinstance(filename,str):
1290 try:
1291 fh=open(filename,'w')
1292 except IOError:
1293 raise NexusError('Could not open %s for writing.' % filename)
1294 elif hasattr(file, "write"):
1295 fh=filename
1296 else :
1297 raise ValueError("Neither a filename nor a handle was supplied")
1298 if not omit_NEXUS:
1299 fh.write('#NEXUS\n')
1300 if comment:
1301 fh.write('['+comment+']\n')
1302 fh.write('begin data;\n')
1303 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1304 fh.write('\tformat datatype='+self.datatype)
1305 if self.respectcase:
1306 fh.write(' respectcase')
1307 if self.missing:
1308 fh.write(' missing='+self.missing)
1309 if self.gap:
1310 fh.write(' gap='+self.gap)
1311 if self.matchchar:
1312 fh.write(' matchchar='+self.matchchar)
1313 if self.labels:
1314 fh.write(' labels='+self.labels)
1315 if self.equate:
1316 fh.write(' equate='+self.equate)
1317 if interleave or interleave_by_partition:
1318 fh.write(' interleave')
1319 fh.write(';\n')
1320
1321
1322 if self.charlabels:
1323 newcharlabels=self._adjust_charlabels(exclude=exclude)
1324 clkeys=newcharlabels.keys()
1325 clkeys.sort()
1326 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1327 fh.write('matrix\n')
1328 if not blocksize:
1329 if interleave:
1330 blocksize=70
1331 else:
1332 blocksize=self.nchar
1333
1334 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1335 if interleave_by_partition:
1336
1337 seek=0
1338 for p in names:
1339 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1340 if len(newpartition[p])>0:
1341 for taxon in undelete:
1342 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1343 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1344 fh.write('\n')
1345 else:
1346 fh.write('[empty]\n\n')
1347 seek+=len(newpartition[p])
1348 elif interleave:
1349 for seek in range(0,nchar_adjusted,blocksize):
1350 for taxon in undelete:
1351 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1352 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1353 fh.write('\n')
1354 else:
1355 for taxon in undelete:
1356 if blocksize<nchar_adjusted:
1357 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1358 else:
1359 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1360 for seek in range(0,nchar_adjusted,blocksize):
1361 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1362 fh.write(';\nend;\n')
1363 if append_sets:
1364 if codons_block:
1365 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1367 else:
1368 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1369
1370 if fh == filename :
1371
1372 return filename
1373 else :
1374
1375 fh.close()
1376 return filename
1377
1379 """Returns a sets block."""
1380 if not self.charsets and not self.taxsets and not self.charpartitions:
1381 return ''
1382 if codons_only:
1383 setsb=['\nbegin codons']
1384 else:
1385 setsb=['\nbegin sets']
1386
1387
1388
1389
1390 offset=0
1391 offlist=[]
1392 for c in range(self.nchar):
1393 if c in exclude:
1394 offset+=1
1395 offlist.append(-1)
1396 else:
1397 offlist.append(c-offset)
1398
1399 if not codons_only:
1400 for n,ns in self.charsets.items():
1401 cset=[offlist[c] for c in ns if c not in exclude]
1402 if cset:
1403 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1404 for n,s in self.taxsets.items():
1405 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1406 if tset:
1407 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1408 for n,p in self.charpartitions.items():
1409 if not include_codons and n==CODONPOSITIONS:
1410 continue
1411 elif codons_only and n!=CODONPOSITIONS:
1412 continue
1413
1414
1415
1416 names=_sort_keys_by_values(p)
1417 newpartition={}
1418 for sn in names:
1419 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1420 if nsp:
1421 newpartition[sn]=nsp
1422 if newpartition:
1423 if include_codons and n==CODONPOSITIONS:
1424 command='codonposset'
1425 else:
1426 command='charpartition'
1427 setsb.append('%s %s = %s' % (command,safename(n),\
1428 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1429
1430 for n,p in self.taxpartitions.items():
1431 names=_sort_keys_by_values(p)
1432 newpartition={}
1433 for sn in names:
1434 nsp=[t for t in p[sn] if t not in delete]
1435 if nsp:
1436 newpartition[sn]=nsp
1437 if newpartition:
1438 setsb.append('taxpartition %s = %s' % (safename(n),\
1439 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1440
1441 setsb.append('end;\n')
1442 if len(setsb)==2:
1443 return ''
1444 else:
1445 return ';\n'.join(setsb)
1446
1448 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1449 if not filename:
1450 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1451 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1452 else:
1453 filename=self.filename+'.fas'
1454 fh=open(filename,'w')
1455 for taxon in self.taxlabels:
1456 fh.write('>'+safename(taxon)+'\n')
1457 for i in range(0, len(self.matrix[taxon].tostring()), width):
1458 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1459 fh.close()
1460 return filename
1461
1463 """Writes matrix into a PHYLIP file: (self, filename=None).
1464
1465 Note that this writes a relaxed PHYLIP format file, where the names
1466 are not truncated, nor checked for invalid characters."""
1467 if not filename:
1468 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1469 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1470 else:
1471 filename=self.filename+'.phy'
1472 fh=open(filename,'w')
1473 fh.write('%d %d\n' % (self.ntax,self.nchar))
1474 for taxon in self.taxlabels:
1475 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1476 fh.close()
1477 return filename
1478
1479 - def constant(self,matrix=None,delete=[],exclude=[]):
1480 """Return a list with all constant characters."""
1481 if not matrix:
1482 matrix=self.matrix
1483 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1484 if not undelete:
1485 return None
1486 elif len(undelete)==1:
1487 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1488
1489 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1490 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1491 for taxon in undelete[1:]:
1492 newconstant=[]
1493 for site in constant:
1494
1495 seqsite=matrix[taxon][site[0]].upper()
1496
1497 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1498
1499 newconstant.append(site)
1500 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1501
1502 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1503 elif seqsite in self.ambiguous_values:
1504 intersect=sets.set(self.ambiguous_values[seqsite]).intersection(sets.set(site[1]))
1505 if intersect:
1506 newconstant.append((site[0],''.join(intersect)))
1507
1508
1509
1510
1511
1512 constant=newconstant
1513 cpos=[s[0] for s in constant]
1514 return constant
1515
1516
1518 """Summarize character.
1519
1520 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1521 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1522 """
1523 undelete=[t for t in self.taxlabels if t not in delete]
1524 if not undelete:
1525 return None
1526 cstatus=[]
1527 for t in undelete:
1528 c=self.matrix[t][site].upper()
1529 if self.options.get('gapmode')=='missing' and c==self.gap:
1530 c=self.missing
1531 if narrow and c==self.missing:
1532 if c not in cstatus:
1533 cstatus.append(c)
1534 else:
1535 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1536 if self.missing in cstatus and narrow and len(cstatus)>1:
1537 cstatus=[c for c in cstatus if c!=self.missing]
1538 cstatus.sort()
1539 return cstatus
1540
1542 """Calculates a stepmatrix for weighted parsimony.
1543
1544 See Wheeler (1990), Cladistics 6:269-275 and
1545 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1546 """
1547 m=StepMatrix(self.unambiguous_letters,self.gap)
1548 for site in [s for s in range(self.nchar) if s not in exclude]:
1549 cstatus=self.cstatus(site,delete)
1550 for i,b1 in enumerate(cstatus[:-1]):
1551 for b2 in cstatus[i+1:]:
1552 m.add(b1.upper(),b2.upper(),1)
1553 return m.transformation().weighting().smprint(name=name)
1554
1555 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1556 """Return a matrix without deleted taxa and excluded characters."""
1557 if not matrix:
1558 matrix=self.matrix
1559 if [t for t in delete if not self._check_taxlabels(t)]:
1560 raise NexusError('Unknown taxa: %s' \
1561 % ', '.join(sets.set(delete).difference(self.taxlabels)))
1562 if exclude!=[]:
1563 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1564 if not undelete:
1565 return {}
1566 m=[matrix[k].tostring() for k in undelete]
1567 zipped_m=zip(*m)
1568 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1569 if sitesm==[]:
1570 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1571 else:
1572 zipped_sitesm=zip(*sitesm)
1573 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1574 return dict(zip(undelete,m))
1575 else:
1576 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1577
1578 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1579 """Return a bootstrapped matrix."""
1580 if not matrix:
1581 matrix=self.matrix
1582 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1583 cm=self.crop_matrix(delete=delete,exclude=exclude)
1584 if not cm:
1585 return {}
1586 elif len(cm[cm.keys()[0]])==0:
1587 return cm
1588 undelete=[t for t in self.taxlabels if t in cm]
1589 if seqobjects:
1590 sitesm=zip(*[cm[t].tostring() for t in undelete])
1591 alphabet=matrix[matrix.keys()[0]].alphabet
1592 else:
1593 sitesm=zip(*[cm[t] for t in undelete])
1594 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1595 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1596 if seqobjects:
1597 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1598 return dict(zip(undelete,bootstrapseqs))
1599
1601 """Adds a sequence (string) to the matrix."""
1602
1603 if not name:
1604 raise NexusError('New sequence must have a name')
1605
1606 diff=self.nchar-len(sequence)
1607 if diff<0:
1608 self.insert_gap(self.nchar,-diff)
1609 elif diff>0:
1610 sequence+=self.missing*diff
1611
1612 if name in self.taxlabels:
1613 unique_name=_unique_label(self.taxlabels,name)
1614
1615 else:
1616 unique_name=name
1617
1618 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1619
1620 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1621 self.ntax+=1
1622 self.taxlabels.append(unique_name)
1623 self.unaltered_taxlabels.append(name)
1624
1626 """Add a gap into the matrix and adjust charsets and partitions.
1627
1628 pos=0: first position
1629 pos=nchar: last position
1630 """
1631
1632 def _adjust(set,x,d,leftgreedy=False):
1633 """Adjusts chartacter sets if gaps are inserted, taking care of
1634 new gaps within a coherent character set."""
1635
1636
1637
1638 set.sort()
1639 addpos=0
1640 for i,c in enumerate(set):
1641 if c>=x:
1642 set[i]=c+d
1643
1644 if c==x:
1645 if leftgreedy or (i>0 and set[i-1]==c-1):
1646 addpos=i
1647 if addpos>0:
1648 set[addpos:addpos]=range(x,x+d)
1649 return set
1650
1651 if pos<0 or pos>self.nchar:
1652 raise NexusError('Illegal gap position: %d' % pos)
1653 if n==0:
1654 return
1655 if self.taxlabels :
1656
1657 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1658 else :
1659 sitesm=[]
1660 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1661
1662
1663 zipped=zip(*sitesm)
1664 mapped=map(''.join,zipped)
1665 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1666 self.matrix=dict(listed)
1667 self.nchar+=n
1668
1669 for i,s in self.charsets.items():
1670 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1671 for p in self.charpartitions:
1672 for sp,s in self.charpartitions[p].items():
1673 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1674
1675 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1676 return self.charlabels
1677
1679 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1680 if exclude and insert:
1681 raise NexusError('Can\'t exclude and insert at the same time')
1682 if not self.charlabels:
1683 return None
1684 labels=self.charlabels.keys()
1685 labels.sort()
1686 newcharlabels={}
1687 if exclude:
1688 exclude.sort()
1689 exclude.append(sys.maxint)
1690 excount=0
1691 for c in labels:
1692 if not c in exclude:
1693 while c>exclude[excount]:
1694 excount+=1
1695 newcharlabels[c-excount]=self.charlabels[c]
1696 elif insert:
1697 insert.sort()
1698 insert.append(sys.maxint)
1699 icount=0
1700 for c in labels:
1701 while c>=insert[icount]:
1702 icount+=1
1703 newcharlabels[c+icount]=self.charlabels[c]
1704 else:
1705 return self.charlabels
1706 return newcharlabels
1707
1709 """Returns all character indices that are not in charlist."""
1710 return [c for c in range(self.nchar) if c not in charlist]
1711
1713 """Return gap-only sites."""
1714 gap=set(self.gap)
1715 if include_missing:
1716 gap.add(self.missing)
1717 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1718 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
1719 return gaponly
1720
1742