diff --git a/sequence b/sequence new file mode 100644 index 0000000000000000000000000000000000000000..16e832e4213220f16536e8f19a89593d5004fb40 --- /dev/null +++ b/sequence @@ -0,0 +1,114 @@ +import parasail + + +class Sequence: + """ + represente une sequence ADN + """ + def __init__(self, sequence: str): + """ + constructeur d'un objet Sequence + :param sequence: str, une sequence ADN + """ + self.sequence = sequence + self.taille_kmer = 0 + + def __len__(self) -> int: + """ + renvoie la taille de l'objet Sequence + :return: int + """ + return len(self.sequence) + + def suffix_array(self) -> list: + """ + construit la table des suffixes + :return: list, la table des suffixes + """ + return sorted(range(len(self.sequence)), key=lambda i: self.sequence[i:]) + + def kmers(self, k) -> list: + """ + renvoie la liste des kmers ; les kmers chevauchants sont filtrés + :param k: int, la taille des kmers + :return: list, les kmers + """ + self.taille_kmer = k + kmers = [] # dict + for i in range(0, (len(self.sequence) - k), int(k/2)): + kmers.append(i) + return kmers + + def get_kmer(self, kmer): + """ + renvoie le kmer correspondant à sa position entree en parametre + :param kmer: int, position du kmer + :return: + """ + sequence = self.sequence + return sequence[kmer:self.taille_kmer] + + def get_sequence(self) -> str: + """ + renvoie la sequencede l'objet Sequence + :return: str, la sequence + """ + return self.sequence + + +def search_kmer(kmer: list, genome: str) -> list: + """ + recherche les hits entre la liste de kmers et le genome + :param kmer: list, liste de kmers + :param genome: str, une sequence adn + :return: list, la liste des positions des hits + """ + liste_positions = [] + n = 0 + table_suffixes = genome.suffix_array() + valeur_suffixe = int(len(table_suffixes) / 2) + while kmer != genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + if kmer < genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + valeur_suffixe = int(valeur_suffixe/2) + if kmer > genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + valeur_suffixe += int((len(table_suffixes) - valeur_suffixe)/2) + if kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + val_egale = valeur_suffixe + liste_positions.append(valeur_suffixe) + if val_egale > 0 and kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + valeur_suffixe = valeur_suffixe - 1 + if kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]: + liste_positions.append(valeur_suffixe) + if val_egale < len(genome): + val_egale = val_egale + 1 + if kmer == genome.sequence[val_egale:val_egale+len(kmer)]: + liste_positions.append(val_egale) + return liste_positions + + +def align_sequences(query: str, genome: str): + """ + realise l'alignement + :param query: str, + :param genome: str, + :return: le score, l'alignement + """ + # alignement semi-global + align_sg = parasail.sg_dx_trace_scan_sat(query, genome, 5, 1, parasail.dnafull) + score = align_sg.score + traceback = align_sg.get_traceback() + + # seq align (query = read, ref = génome) + align_seq_query = traceback.query + align_seq_genome = traceback.ref + return score, align_seq_query, align_seq_genome + + +if __name__ == "__main__": + query = "ATGACCA" + genome = "ATTCATGACCAGGTGCTGTCCAGGCTGAATGACCAGGTCGGATTATAGGCTATGACCAGGATGC" + genome_objet = Sequence(genome) + query_objet = Sequence(query) + suffixe_array = genome_objet.suffix_array() + for value in suffixe_array: + print(genome[value:])