1 """Hold GenBank data in a straightforward format.
2
3 classes:
4 o Record - All of the information in a GenBank record.
5 o Reference - hold reference data for a record.
6 o Feature - Hold the information in a Feature Table.
7 o Qualifier - Qualifiers on a Feature.
8 17-MAR-2009: added support for WGS and WGS_SCAFLD lines. Ying Huang & Iddo Friedberg
9 """
10
11 import Bio.GenBank
12
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the lines we are writing.

    o wrap_space - Whether or not to wrap only on spaces in the
    information. When false, the text is cut into fixed-width chunks.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    Returns the wrapped text. The first line carries no indent (the caller
    is expected to have written the keyword column already); every
    following line is prefixed with `indent` spaces. Each line ends with a
    newline. Empty information yields ".\\n".

    Raises ValueError when wrap_space is false and indent leaves no room
    for text on a line.
    """
    if not information:
        # Nothing to wrap. Return the "." placeholder GenBank flat files
        # use for an empty field rather than failing further down.
        return ".\n"

    info_length = Record.GB_LINE_LENGTH - indent

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length < 1:
            # A non-positive chunk width can never consume the text.
            raise ValueError("indent %r leaves no room for text" % (indent,))
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # First pass: pack the parts greedily into lines of at most info_length.
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # A non-space separator is significant, so keep it at the
                # end of the line that is being closed.
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # Add the last bit of information to the output.
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # The information held nothing but separators.
        return ".\n"

    # Second pass: join the lines, indenting every line after the first.
    return ("\n" + " " * indent).join(output_parts) + "\n"
70
def _indent_genbank(information, indent):
    """Write out information with the specified indent.

    Unlike _wrapped_genbank, this function makes no attempt to wrap
    lines -- it assumes that the information already has newlines in the
    appropriate places, and will add the specified indent to the start of
    each line after the first.

    Arguments:

    o information - The string to indent; it may contain newlines.

    o indent - The number of spaces placed in front of every line except
    the first one.

    Returns the indented text, always terminated by a newline.
    """
    # Every embedded newline starts a continuation line, so following it
    # with the indent is the same as splitting and re-joining the lines.
    return information.replace("\n", "\n" + " " * indent) + "\n"
88
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (e.g. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (e.g. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (e.g. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o projects - The genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
    (will replace the project information in 2009).
    o wgs_scafld - The text written after the WGS_SCAFLD keyword.
    """

    # Column layout shared by all of the line builders.
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # Left-justified keyword columns, e.g. BASE_FORMAT == "%-12s".
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # Right-justified position column in front of each sequence row.
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def _locus_line(self):
        """Provide the output string for the LOCUS line.

        Reads locus, size, residue_type, data_file_division and date and
        lays them out in fixed-width columns, ending with a newline.
        """
        residue_type = self.residue_type
        pieces = ["LOCUS", " " * 7, "%-9s" % self.locus, " ",
                  "%7s" % self.size]

        # Unit of the size column: amino acids for proteins, else bases.
        if "PROTEIN" in residue_type:
            pieces.append(" aa")
        else:
            pieces.append(" bp ")

        # The residue type column is laid out in one of three ways,
        # depending on how much of the field the text needs.
        if "circular" in residue_type:
            # Topology is part of the text, so it takes the whole field.
            pieces.append("%17s" % residue_type)
        elif "-" in residue_type:
            # Strandedness prefix such as 'ss-RNA'.
            pieces.append("%7s" % residue_type)
            pieces.append(" " * 10)
        else:
            pieces.append(" " * 3)
            pieces.append("%-4s" % residue_type)
            pieces.append(" " * 10)

        pieces.append(" " * 2)
        pieces.append("%3s" % self.data_file_division)
        pieces.append(" " * 7)
        pieces.append("%11s" % self.date)
        pieces.append("\n")
        return "".join(pieces)

    def _version_line(self):
        """Output for the VERSION line.

        Returns an empty string when there is no version. The GI part is
        only written when a gi value is present, so a record without one
        does not end in a dangling ' GI:'.
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        if self.gi:
            output += " GI:%s" % self.gi
        return output + "\n"

    def _project_line(self):
        """Output for the PROJECT line.

        The project identifiers are joined with single spaces. Returns an
        empty string when there are none.
        """
        if not self.projects:
            return ""
        output = Record.BASE_FORMAT % "PROJECT"
        output += "%s\n" % " ".join(self.projects)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.

        Returns an empty string when nid is not set.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumably, PID usage is also obsolete.

        Returns an empty string when pid is not set.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _db_source_line(self):
        """Output for DBSOURCE line.

        Returns an empty string when db_source is not set.
        """
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _features_line(self):
        """Output for the FEATURES line.

        Only the table header is produced here, and only when the record
        has at least one feature.
        """
        if not self.features:
            return ""
        return Record.BASE_FEATURE_FORMAT % "FEATURES" + \
            "Location/Qualifiers\n"

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.

        Returns an empty string when base_counts is not set.
        """
        if not self.base_counts:
            return ""

        pieces = [Record.BASE_FORMAT % "BASE COUNT "]

        # Split on single spaces and drop the empty strings that runs of
        # spaces leave behind.
        count_parts = [part for part in self.base_counts.split(" ") if part]

        if len(count_parts) % 2 == 0:
            # Standard case: alternating count / base letter,
            # like '474 a 356 c 428 g 364 t'.
            for count_info, count_type in zip(count_parts[0::2],
                                              count_parts[1::2]):
                pieces.append("%7s %s" % (count_info, count_type))
        else:
            # Anything that does not pair up is written back verbatim.
            pieces.append(self.base_counts)

        pieces.append("\n")
        return "".join(pieces)

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per row in six
        sections of ten, each row preceded by the 1-based position of its
        first residue. Returns an empty string when there is no sequence.
        """
        if not self.sequence:
            return ""

        sequence = self.sequence
        total = len(sequence)
        rows = []
        for row_start in range(0, total, 60):
            row = [Record.SEQUENCE_FORMAT % str(row_start + 1)]
            # Stop at the end of the sequence so that no empty section
            # (and therefore no trailing blank) is ever written.
            row_end = min(row_start + 60, total)
            for section_start in range(row_start, row_end, 10):
                section = sequence[section_start:section_start + 10]
                row.append(" %s" % section.lower())
            row.append("\n")
            rows.append("".join(row))
        return "".join(rows)

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD line.

        Returns an empty string when wgs_scafld is not set.
        """
        # NOTE(review): unlike the other line builders no newline is added
        # here -- confirm that wgs_scafld already ends with one.
        if not self.wgs_scafld:
            return ""
        return Record.BASE_FORMAT % "WGS_SCAFLD" + self.wgs_scafld
484
494
495
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """

    def _reference_line(self):
        """Output for REFERENCE lines.

        Always writes the REFERENCE keyword and a closing newline. The
        reference number follows when it is set, and the bases follow the
        number when both are set.
        """
        output = Record.BASE_FORMAT % "REFERENCE"
        if self.number:
            if self.bases:
                # The number is padded to three columns so that the base
                # range lines up across references.
                output += "%-3s" % self.number
                output += "%s" % self.bases
            else:
                output += "%s" % self.number
        return output + "\n"
546
555
564
573
582
591
600
609
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (e.g. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Start with an empty key, an empty location and no qualifiers."""
        self.key = ''
        self.location = ''
        # A fresh list per instance, so features never share qualifiers.
        self.qualifiers = []
622
640
642 """Hold information about a qualifier in a GenBank feature.
643
644 Attributes:
645 o key - The key name of the qualifier (e.g. /organism=)
646 o value - The value of the qualifier ("Dictyostelium discoideum").
647 """
651