Select Git revision
Sequence.py
-
Clara Moreno authoredClara Moreno authored
Sequence.py 4.49 KiB
import parasail
class Sequence:
"""
represente une sequence ADN
"""
def __init__(self, sequence: str):
"""
constructeur d'un objet Sequence
:param sequence: str, une sequence ADN
"""
self.sequence = sequence
self.taille_kmer = 0
def __len__(self) -> int:
"""
renvoie la taille de l'objet Sequence
:return: int
"""
return len(self.sequence)
def suffix_array(self) -> list:
"""
construit la table des suffixes
:return: list, la table des suffixes
exemple :
>>> genome = "ATTCATGACCAGGTGCTGTCCAGGCTGAATGACCAGGTCGGATTATAGGCTATGACCAGGATGC"
>>> genome_obj = Sequence(genome)
>>> genome_obj.suffix_array()
[27, 54, 31, 7, 57, 46, 21, 34, 10, 44, 51, 28, 4, 60, 41, 0, 63, 56, 20, 33, 9, 3, 55, 19, 32, 8, 38, 49, 24, 15, 26, 53, 30, 6, 59, 40, 62, 48, 23, 14, 58, 39, 47, 22, 35, 11, 17, 36, 12, 45, 43, 50, 2, 18, 37, 25, 52, 29, 5, 61, 13, 16, 42, 1]
"""
return sorted(range(len(self.sequence)), key=lambda i: self.sequence[i:])
def kmers(self, k) -> list:
"""
renvoie la liste des kmers ; les kmers chevauchants sont filtrés
:param k: int, la taille des kmers
:return: list, les kmers
exemple :
>>> query = "ATGACCAATTCATGACCAGGTGCTGTCCAGGCTGAATG"
>>> query_obj = Sequence(query)
>>> query_obj.kmers(11)
[0, 5, 10, 15, 20, 25, 27]
"""
self.taille_kmer = k
kmers = [] # dict
for i in range(0, (len(self.sequence) - k), int(k/2)):
kmers.append(i)
return kmers
def get_kmer(self, kmer):
"""
renvoie le kmer correspondant à sa position entree en parametre
:param kmer: int, position du kmer
:return:
"""
sequence = self.sequence
return sequence[kmer:self.taille_kmer]
def get_sequence(self) -> str:
"""
renvoie la sequencede l'objet Sequence
:return: str, la sequence
exemple :
>>> seq = "ATGACCAATTCATGACCAGGTGCTGTCCAGGCTGAATG"
>>> seq_obj = Sequence(seq)
>>> seq_obj.get_sequence()
ATGACCAATTCATGACCAGGTGCTGTCCAGGCTGAATG
"""
return self.sequence
def search_kmer(kmer: list, genome: str) -> list:
"""
recherche les hits entre la liste de kmers et le genome
:param kmer: list, liste de kmers
:param genome: str, une sequence adn
:return: list, la liste des positions des hits
"""
liste_positions = []
n = 0
table_suffixes = genome.suffix_array()
valeur_suffixe = int(len(table_suffixes) / 2)
while kmer != genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
if kmer < genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
valeur_suffixe = int(valeur_suffixe/2)
if kmer > genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
valeur_suffixe += int((len(table_suffixes) - valeur_suffixe)/2)
if kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
val_egale = valeur_suffixe
liste_positions.append(valeur_suffixe)
if val_egale > 0 and kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
valeur_suffixe = valeur_suffixe - 1
if kmer == genome.sequence[valeur_suffixe:valeur_suffixe+len(kmer)]:
liste_positions.append(valeur_suffixe)
if val_egale < len(genome):
val_egale = val_egale + 1
if kmer == genome.sequence[val_egale:val_egale+len(kmer)]:
liste_positions.append(val_egale)
return liste_positions
def align_sequences(query: str, genome: str):
"""
realise l'alignement
:param query: str,
:param genome: str,
:return: le score, l'alignement
"""
# alignement semi-global
align_sg = parasail.sg_dx_trace_scan_sat(query, genome, 5, 1, parasail.dnafull)
score = align_sg.score
traceback = align_sg.get_traceback()
# seq align (query = read, ref = génome)
align_seq_query = traceback.query
align_seq_genome = traceback.ref
return score, align_seq_query, align_seq_genome
if __name__ == "__main__":
query = "ATGACCA"
genome = "ATTCATGACCAGGTGCTGTCCAGGCTGAATGACCAGGTCGGATTATAGGCTATGACCAGGATGC"
genome_objet = Sequence(genome)
query_objet = Sequence(query)
suffixe_array = genome_objet.suffix_array()
for value in suffixe_array:
print(genome[value:])