Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  2   
  3  This class contains Schema which deal with Motifs and Signatures at 
  4  a higher level, by introducing `don't care` (ambiguity) symbols into 
  5  the sequences. For instance, you could combine the following Motifs: 
  6   
  7  'GATC', 'GATG', 'GATG', 'GATT' 
  8   
  9  as all falling under a schema like 'GAT*', where the star indicates a 
 10  character can be anything. This helps us condense a whole ton of 
 11  motifs or signatures. 
 12  """ 
 13  # standard modules 
 14  import random 
 15  import re 
 16   
 17  # biopython 
 18  from Bio import Alphabet 
 19  from Bio.Seq import MutableSeq 
 20   
 21  # neural network libraries 
 22  from Pattern import PatternRepository 
 23   
 24  # genetic algorithm libraries 
 25  from Bio.GA import Organism 
 26  from Bio.GA.Evolver import GenerationEvolver 
 27  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 28  from Bio.GA.Crossover.Point import SinglePointCrossover 
 29  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 30  from Bio.GA.Selection.Tournament import TournamentSelection 
 31  from Bio.GA.Selection.Diversity import DiversitySelection 
 32   
33 -class Schema:
34 """Deal with motifs that have ambiguity characters in it. 35 36 This motif class allows specific ambiguity characters and tries to 37 speed up finding motifs using regular expressions. 38 39 This is likely to be a replacement for the Schema representation, 40 since it allows multiple ambiguity characters to be used. 41 """
42 - def __init__(self, ambiguity_info):
43 """Initialize with ambiguity information. 44 45 Arguments: 46 47 o ambiguity_info - A dictionary which maps letters in the motifs to 48 the ambiguous characters which they might represent. For example, 49 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G. 50 All letters in the motif must be represented in the ambiguity_info 51 dictionary. 52 """ 53 self._ambiguity_info = ambiguity_info 54 55 # a cache of all encoded motifs 56 self._motif_cache = {}
57
58 - def encode_motif(self, motif):
59 """Encode the passed motif as a regular expression pattern object. 60 61 Arguments: 62 63 o motif - The motif we want to encode. This should be a string. 64 65 Returns: 66 A compiled regular expression pattern object that can be used 67 for searching strings. 68 """ 69 regexp_string = "" 70 71 for motif_letter in motif: 72 try: 73 letter_matches = self._ambiguity_info[motif_letter] 74 except KeyError: 75 raise KeyError("No match information for letter %s" 76 % motif_letter) 77 78 if len(letter_matches) > 1: 79 regexp_match = "[" + letter_matches + "]" 80 elif len(letter_matches) == 1: 81 regexp_match = letter_matches 82 else: 83 raise ValueError("Unexpected match information %s" 84 % letter_matches) 85 86 regexp_string += regexp_match 87 88 return re.compile(regexp_string)
89
90 - def find_ambiguous(self, motif):
91 """Return the location of ambiguous items in the motif. 92 93 This just checks through the motif and compares each letter 94 against the ambiguity information. If a letter stands for multiple 95 items, it is ambiguous. 96 """ 97 ambig_positions = [] 98 for motif_letter_pos in range(len(motif)): 99 motif_letter = motif[motif_letter_pos] 100 try: 101 letter_matches = self._ambiguity_info[motif_letter] 102 except KeyError: 103 raise KeyError("No match information for letter %s" 104 % motif_letter) 105 106 if len(letter_matches) > 1: 107 ambig_positions.append(motif_letter_pos) 108 109 return ambig_positions
110
111 - def num_ambiguous(self, motif):
112 """Return the number of ambiguous letters in a given motif. 113 """ 114 ambig_positions = self.find_ambiguous(motif) 115 return len(ambig_positions)
116
117 - def find_matches(self, motif, query):
118 """Return all non-overlapping motif matches in the query string. 119 120 This utilizes the regular expression findall function, and will 121 return a list of all non-overlapping occurances in query that 122 match the ambiguous motif. 123 """ 124 try: 125 motif_pattern = self._motif_cache[motif] 126 except KeyError: 127 motif_pattern = self.encode_motif(motif) 128 self._motif_cache[motif] = motif_pattern 129 130 return motif_pattern.findall(query)
131
132 - def num_matches(self, motif, query):
133 """Find the number of non-overlapping times motif occurs in query. 134 """ 135 all_matches = self.find_matches(motif, query) 136 return len(all_matches)
137
138 - def all_unambiguous(self):
139 """Return a listing of all unambiguous letters allowed in motifs. 140 """ 141 all_letters = self._ambiguity_info.keys() 142 all_letters.sort() 143 144 unambig_letters = [] 145 146 for letter in all_letters: 147 possible_matches = self._ambiguity_info[letter] 148 if len(possible_matches) == 1: 149 unambig_letters.append(letter) 150 151 return unambig_letters
152 153 # --- helper classes and functions for the default SchemaFinder 154 155 # -- Alphabets 156
157 -class SchemaDNAAlphabet(Alphabet.Alphabet):
158 """Alphabet of a simple Schema for DNA sequences. 159 160 This defines a simple alphabet for DNA sequences that has a single 161 character which can match any other character. 162 163 o G,A,T,C - The standard unambiguous DNA alphabet. 164 165 o * - Any letter 166 """ 167 letters = ["G", "A", "T", "C", "*"] 168 169 alphabet_matches = {"G" : "G", 170 "A" : "A", 171 "T" : "T", 172 "C" : "C", 173 "*" : "GATC"}
174 175 # -- GA schema finder 176
177 -class GeneticAlgorithmFinder:
178 """Find schemas using a genetic algorithm approach. 179 180 This approach to finding schema uses Genetic Algorithms to evolve 181 a set of schema and find the best schema for a specific set of 182 records. 183 184 The 'default' finder searches for ambiguous DNA elements. This 185 can be overridden easily by creating a GeneticAlgorithmFinder 186 with a different alphabet. 187 """
188 - def __init__(self, alphabet = SchemaDNAAlphabet()):
189 """Initialize a finder to get schemas using Genetic Algorithms. 190 191 Arguments: 192 193 o alphabet -- The alphabet which specifies the contents of the 194 schemas we'll be generating. This alphabet must contain the 195 attribute 'alphabet_matches', which is a dictionary specifying 196 the potential ambiguities of each letter in the alphabet. These 197 ambiguities will be used in building up the schema. 198 """ 199 self.alphabet = alphabet 200 201 self.initial_population = 500 202 self.min_generations = 10 203 204 self._set_up_genetic_algorithm()
205
207 """Overrideable function to set up the genetic algorithm parameters. 208 209 This functions sole job is to set up the different genetic 210 algorithm functionality. Since this can be quite complicated, this 211 allows cusotmizablity of all of the parameters. If you want to 212 customize specially, you can inherit from this class and override 213 this function. 214 """ 215 self.motif_generator = RandomMotifGenerator(self.alphabet) 216 217 self.mutator = SinglePositionMutation(mutation_rate = 0.1) 218 self.crossover = SinglePointCrossover(crossover_prob = 0.25) 219 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 220 4) 221 self.base_selector = TournamentSelection(self.mutator, self.crossover, 222 self.repair, 2) 223 self.selector = DiversitySelection(self.base_selector, 224 self.motif_generator.random_motif)
225
226 - def find_schemas(self, fitness, num_schemas):
227 """Find the given number of unique schemas using a genetic algorithm 228 229 Arguments: 230 231 o fitness - A callable object (ie. function) which will evaluate 232 the fitness of a motif. 233 234 o num_schemas - The number of unique schemas with good fitness 235 that we want to generate. 236 """ 237 start_population = \ 238 Organism.function_population(self.motif_generator.random_motif, 239 self.initial_population, 240 fitness) 241 finisher = SimpleFinisher(num_schemas, self.min_generations) 242 243 # set up the evolver and do the evolution 244 evolver = GenerationEvolver(start_population, self.selector) 245 evolved_pop = evolver.evolve(finisher.is_finished) 246 247 # convert the evolved population into a PatternRepository 248 schema_info = {} 249 for org in evolved_pop: 250 # convert the Genome from a MutableSeq to a Seq so that 251 # the schemas are just strings (and not array("c")s) 252 seq_genome = org.genome.toseq() 253 schema_info[seq_genome.data] = org.fitness 254 255 return PatternRepository(schema_info)
256 257 # -- fitness classes 258
259 -class DifferentialSchemaFitness:
260 """Calculate fitness for schemas that differentiate between sequences. 261 """
262 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
263 """Initialize with different sequences to evaluate 264 265 Arguments: 266 267 o positive_seq - A list of SeqRecord objects which are the 'positive' 268 sequences -- the ones we want to select for. 269 270 o negative_seq - A list of SeqRecord objects which are the 'negative' 271 sequences that we want to avoid selecting. 272 273 o schema_evaluator - An Schema class which can be used to 274 evaluate find motif matches in sequences. 275 """ 276 self._pos_seqs = positive_seqs 277 self._neg_seqs = negative_seqs 278 self._schema_eval = schema_evaluator
279
280 - def calculate_fitness(self, genome):
281 """Calculate the fitness for a given schema. 282 283 Fitness is specified by the number of occurances of the schema in 284 the positive sequences minus the number of occurances in the 285 negative examples. 286 287 This fitness is then modified by multiplying by the length of the 288 schema and then dividing by the number of ambiguous characters in 289 the schema. This helps select for schema which are longer and have 290 less redundancy. 291 """ 292 # convert the genome into a string 293 seq_motif = genome.toseq() 294 motif = seq_motif.data 295 296 # get the counts in the positive examples 297 num_pos = 0 298 for seq_record in self._pos_seqs: 299 cur_counts = self._schema_eval.num_matches(motif, 300 seq_record.seq.data) 301 num_pos += cur_counts 302 303 # get the counts in the negative examples 304 num_neg = 0 305 for seq_record in self._neg_seqs: 306 cur_counts = self._schema_eval.num_matches(motif, 307 seq_record.seq.data) 308 309 num_neg += cur_counts 310 311 num_ambiguous = self._schema_eval.num_ambiguous(motif) 312 # weight the ambiguous stuff more highly 313 num_ambiguous = pow(2.0, num_ambiguous) 314 # increment num ambiguous to prevent division by zero errors. 315 num_ambiguous += 1 316 317 motif_size = len(motif) 318 motif_size = motif_size * 4.0 319 320 discerning_power = num_pos - num_neg 321 322 diff = (discerning_power * motif_size) / float(num_ambiguous) 323 return diff
324
325 -class MostCountSchemaFitness:
326 """Calculate a fitness giving weight to schemas that match many times. 327 328 This fitness function tries to maximize schemas which are found many 329 times in a group of sequences. 330 """
331 - def __init__(self, seq_records, schema_evaluator):
332 """Initialize with sequences to evaluate. 333 334 Arguments: 335 336 o seq_records -- A set of SeqRecord objects which we use to 337 calculate the fitness. 338 339 o schema_evaluator - An Schema class which can be used to 340 evaluate find motif matches in sequences. 341 """ 342 self._records = seq_records 343 self._evaluator = schema_evaluator
344
345 - def calculate_fitness(self, genome):
346 """Calculate the fitness of a genome based on schema matches. 347 348 This bases the fitness of a genome completely on the number of times 349 it matches in the set of seq_records. Matching more times gives a 350 better fitness 351 """ 352 # convert the genome into a string 353 seq_motif = genome.toseq() 354 motif = seq_motif.data 355 356 # find the number of times the genome matches 357 num_times = 0 358 for seq_record in self._records: 359 cur_counts = self._evaluator.num_matches(motif, 360 seq_record.seq.data) 361 num_times += cur_counts 362 363 return num_times
364 365 # -- Helper classes
366 -class RandomMotifGenerator:
367 """Generate a random motif within given parameters. 368 """
369 - def __init__(self, alphabet, min_size = 12, max_size = 17):
370 """Initialize with the motif parameters. 371 372 Arguments: 373 374 o alphabet - An alphabet specifying what letters can be inserted in 375 a motif. 376 377 o min_size, max_size - Specify the range of sizes for motifs. 378 """ 379 self._alphabet = alphabet 380 self._min_size = min_size 381 self._max_size = max_size
382
383 - def random_motif(self):
384 """Create a random motif within the given parameters. 385 386 This returns a single motif string with letters from the given 387 alphabet. The size of the motif will be randomly chosen between 388 max_size and min_size. 389 """ 390 motif_size = random.randrange(self._min_size, self._max_size) 391 392 motif = "" 393 for letter_num in range(motif_size): 394 cur_letter = random.choice(self._alphabet.letters) 395 motif += cur_letter 396 397 return MutableSeq(motif, self._alphabet)
398
399 -class SimpleFinisher:
400 """Determine when we are done evolving motifs. 401 402 This takes the very simple approach of halting evolution when the 403 GA has proceeded for a specified number of generations and has 404 a given number of unique schema with positive fitness. 405 """
406 - def __init__(self, num_schemas, min_generations = 100):
407 """Initialize the finisher with its parameters. 408 409 Arguments: 410 411 o num_schemas -- the number of useful (positive fitness) schemas 412 we want to generation 413 414 o min_generations -- The minimum number of generations to allow 415 the GA to proceed. 416 """ 417 self.num_generations = 0 418 419 self.num_schemas = num_schemas 420 self.min_generations = min_generations
421
422 - def is_finished(self, organisms):
423 """Determine when we can stop evolving the population. 424 """ 425 self.num_generations += 1 426 # print "generation %s" % self.num_generations 427 428 if self.num_generations >= self.min_generations: 429 all_seqs = [] 430 for org in organisms: 431 if org.fitness > 0: 432 if org.genome not in all_seqs: 433 all_seqs.append(org.genome) 434 435 if len(all_seqs) >= self.num_schemas: 436 return 1 437 438 return 0
439 # --- 440
441 -class SchemaFinder:
442 """Find schema in a set of sequences using a genetic algorithm approach. 443 444 Finding good schemas is very difficult because it takes forever to 445 enumerate all of the potential schemas. This finder using a genetic 446 algorithm approach to evolve good schema which match many times in 447 a set of sequences. 448 449 The default implementation of the finder is ready to find schemas 450 in a set of DNA sequences, but the finder can be customized to deal 451 with any type of data. 452 """
453 - def __init__(self, num_schemas = 100, 454 schema_finder = GeneticAlgorithmFinder()):
455 self.num_schemas = num_schemas 456 self._finder = schema_finder 457 458 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
459
460 - def find(self, seq_records):
461 """Find well-represented schemas in the given set of SeqRecords. 462 """ 463 fitness_evaluator = MostCountSchemaFitness(seq_records, 464 self.evaluator) 465 466 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 467 self.num_schemas)
468
469 - def find_differences(self, first_records, second_records):
470 """Find schemas which differentiate between the two sets of SeqRecords. 471 """ 472 fitness_evaluator = DifferentialSchemaFitness(first_records, 473 second_records, 474 self.evaluator) 475 476 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 477 self.num_schemas)
478
479 -class SchemaCoder:
480 """Convert a sequence into a representation of ambiguous motifs (schemas). 481 482 This takes a sequence, and returns the number of times specified 483 motifs are found in the sequence. This lets you represent a sequence 484 as just a count of (possibly ambiguous) motifs. 485 """
486 - def __init__(self, schemas, ambiguous_converter):
487 """Initialize the coder to convert sequences 488 489 Arguments: 490 491 o schema - A list of all of the schemas we want to search for 492 in input sequences. 493 494 o ambiguous_converter - An Schema class which can be 495 used to convert motifs into regular expressions for searching. 496 """ 497 self._schemas = schemas 498 self._converter = ambiguous_converter
499
500 - def representation(self, sequence):
501 """Represent the given input sequence as a bunch of motif counts. 502 503 Arguments: 504 505 o sequence - A Bio.Seq object we are going to represent as schemas. 506 507 This takes the sequence, searches for the motifs within it, and then 508 returns counts specifying the relative number of times each motifs 509 was found. The frequencies are in the order the original motifs were 510 passed into the initializer. 511 """ 512 schema_counts = [] 513 514 for schema in self._schemas: 515 num_counts = self._converter.num_matches(schema, sequence.data) 516 schema_counts.append(num_counts) 517 518 # normalize the counts to go between zero and one 519 min_count = 0 520 max_count = max(schema_counts) 521 522 # only normalize if we've actually found something, otherwise 523 # we'll just return 0 for everything 524 if max_count > 0: 525 for count_num in range(len(schema_counts)): 526 schema_counts[count_num] = (float(schema_counts[count_num]) - 527 float(min_count)) / float(max_count) 528 529 return schema_counts
530
531 -def matches_schema(pattern, schema, ambiguity_character = '*'):
532 """Determine whether or not the given pattern matches the schema. 533 534 Arguments: 535 536 o pattern - A string representing the pattern we want to check for 537 matching. This pattern can contain ambiguity characters (which are 538 assumed to be the same as those in the schema). 539 540 o schema - A string schema with ambiguity characters. 541 542 o ambiguity_character - The character used for ambiguity in the schema. 543 """ 544 if len(pattern) != len(schema): 545 return 0 546 547 # check each position, and return a non match if the schema and pattern 548 # are non ambiguous and don't match 549 for pos in range(len(pattern)): 550 if (schema[pos] != ambiguity_character and 551 pattern[pos] != ambiguity_character and 552 pattern[pos] != schema[pos]): 553 554 return 0 555 556 return 1
557
558 -class SchemaFactory:
559 """Generate Schema from inputs of Motifs or Signatures. 560 """
561 - def __init__(self, ambiguity_symbol = '*'):
562 """Initialize the SchemaFactory 563 564 Arguments: 565 566 o ambiguity_symbol -- The symbol to use when specifying that 567 a position is arbitrary. 568 """ 569 self._ambiguity_symbol = ambiguity_symbol
570
571 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
572 """Generate schema from a list of motifs. 573 574 Arguments: 575 576 o motif_repository - A MotifRepository class that has all of the 577 motifs we want to convert to Schema. 578 579 o motif_percent - The percentage of motifs in the motif bank which 580 should be matches. We'll try to create schema that match this 581 percentage of motifs. 582 583 o num_ambiguous - The number of ambiguous characters to include 584 in each schema. The positions of these ambiguous characters will 585 be randomly selected. 586 """ 587 # get all of the motifs we can deal with 588 all_motifs = motif_repository.get_top_percentage(motif_percent) 589 590 # start building up schemas 591 schema_info = {} 592 # continue until we've built schema matching the desired percentage 593 # of motifs 594 total_count = self._get_num_motifs(motif_repository, all_motifs) 595 matched_count = 0 596 assert total_count > 0, "Expected to have motifs to match" 597 while (float(matched_count) / float(total_count)) < motif_percent: 598 599 new_schema, matching_motifs = \ 600 self._get_unique_schema(schema_info.keys(), 601 all_motifs, num_ambiguous) 602 603 # get the number of counts for the new schema and clean up 604 # the motif list 605 schema_counts = 0 606 for motif in matching_motifs: 607 # get the counts for the motif 608 schema_counts += motif_repository.count(motif) 609 610 # remove the motif from the motif list since it is already 611 # represented by this schema 612 all_motifs.remove(motif) 613 614 615 # all the schema info 616 schema_info[new_schema] = schema_counts 617 618 matched_count += schema_counts 619 620 # print "percentage:", float(matched_count) / float(total_count) 621 622 return PatternRepository(schema_info)
623
624 - def _get_num_motifs(self, repository, motif_list):
625 """Return the number of motif counts for the list of motifs. 626 """ 627 motif_count = 0 628 for motif in motif_list: 629 motif_count += repository.count(motif) 630 631 return motif_count
632
633 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
634 """Retrieve a unique schema from a motif. 635 636 We don't want to end up with schema that match the same thing, 637 since this could lead to ambiguous results, and be messy. This 638 tries to create schema, and checks that they do not match any 639 currently existing schema. 640 """ 641 # create a schema starting with a random motif 642 # we'll keep doing this until we get a completely new schema that 643 # doesn't match any old schema 644 num_tries = 0 645 646 while 1: 647 # pick a motif to work from and make a schema from it 648 cur_motif = random.choice(motif_list) 649 650 num_tries += 1 651 652 new_schema, matching_motifs = \ 653 self._schema_from_motif(cur_motif, motif_list, 654 num_ambiguous) 655 656 has_match = 0 657 for old_schema in cur_schemas: 658 if matches_schema(new_schema, old_schema, 659 self._ambiguity_symbol): 660 has_match = 1 661 662 # if the schema doesn't match any other schema we've got 663 # a good one 664 if not(has_match): 665 break 666 667 # check for big loops in which we can't find a new schema 668 assert num_tries < 150, \ 669 "Could not generate schema in %s tries from %s with %s" \ 670 % (num_tries, motif_list, cur_schemas) 671 672 return new_schema, matching_motifs
673
674 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
675 """Create a schema from a given starting motif. 676 677 Arguments: 678 679 o motif - A motif with the pattern we will start from. 680 681 o motif_list - The total motifs we have.to match to. 682 683 o num_ambiguous - The number of ambiguous characters that should 684 be present in the schema. 685 686 Returns: 687 688 o A string representing the newly generated schema. 689 690 o A list of all of the motifs in motif_list that match the schema. 691 """ 692 assert motif in motif_list, \ 693 "Expected starting motif present in remaining motifs." 694 695 # convert random positions in the motif to ambiguous characters 696 # convert the motif into a list of characters so we can manipulate it 697 new_schema_list = list(motif) 698 for add_ambiguous in range(num_ambiguous): 699 # add an ambiguous position in a new place in the motif 700 while 1: 701 ambig_pos = random.choice(range(len(new_schema_list))) 702 703 # only add a position if it isn't already ambiguous 704 # otherwise, we'll try again 705 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 706 new_schema_list[ambig_pos] = self._ambiguity_symbol 707 break 708 709 # convert the schema back to a string 710 new_schema = ''.join(new_schema_list) 711 712 # get the motifs that the schema matches 713 matched_motifs = [] 714 for motif in motif_list: 715 if matches_schema(motif, new_schema, self._ambiguity_symbol): 716 matched_motifs.append(motif) 717 718 return new_schema, matched_motifs
719
720 - def from_signatures(self, signature_repository, num_ambiguous):
721 raise NotImplementedError("Still need to code this.")
722