Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MABS-2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thomas Wacquet
MABS-2
Commits
922db3a6
Commit
922db3a6
authored
1 year ago
by
Thomas Wacquet
Browse files
Options
Downloads
Patches
Plain Diff
some changes in classes
parent
5ab332fc
Branches
main
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
main.py
+128
-63
128 additions, 63 deletions
main.py
with
128 additions
and
63 deletions
main.py
+
128
−
63
View file @
922db3a6
...
...
@@ -17,13 +17,13 @@ label = "NC_007357.1 Influenza A virus (A/Goose/Guangdong/1/96(H5N1)) polymerase
# Lectures des reads :
reads
=
{}
with
gzip
.
open
(
'
SRR10971381_1.fastq
.gz
'
,
'
rt
'
)
as
fastq
:
counter
=
0
for
i
,
record
in
enumerate
(
SeqIO
.
parse
(
fastq
,
'
fastq
'
),
start
=
1
):
reads
[
i
]
=
(
str
(
record
.
seq
))
counter
+=
1
if
counter
>=
1000
:
break
with
gzip
.
open
(
'
data/covid
.gz
'
,
'
rt
'
)
as
fastq
:
counter
=
0
for
i
,
record
in
enumerate
(
SeqIO
.
parse
(
fastq
,
'
fastq
'
),
start
=
1
):
reads
[
i
]
=
(
str
(
record
.
seq
))
counter
+=
1
if
counter
>=
1000
:
break
# Lectures de Grippe A :
genes_grippe
=
IOAlignment
(
'
data/covid.fna
'
)
...
...
@@ -35,74 +35,139 @@ genes_grippe = genes_grippe.ParseFasta() # ID long => garder l'entier en entier
#genome="BGAABCGHJ"
# Genome
## - genome
## + create_suffix_table()
## + search_suffix()
## + get_suffix()
# Reads
## +
## + split_k_mers(k)
##
class
Genome
:
def
__init__
(
self
,
seq
):
self
.
__seq
=
seq
self
.
__suffix_table
=
self
.
__create_suffix_table
(
self
)
# crée la table des indices des suffixes triés
def
__create_suffix_table
(
self
):
# génère indices de 0 à la taille du génome
indices
=
[
i
for
i
in
range
(
len
(
self
.
__seq
))]
return
sorted
(
indices
,
key
=
self
.
__get_suffix
)
# obtenir le suffixe à la position i - pour trier correctement
def
__get_suffix
(
self
,
i
):
return
self
.
__seq
[
i
:]
def
search_indice
(
self
,
seq_to_search
):
return
self
.
__search_suffix
(
seq_to_search
,
0
,
len
(
self
.
__seq
))
def
__search_suffix
(
self
,
seq_to_search
,
i1
,
i2
):
# kmer non trouvé
if
i2
-
i1
<
1
:
return
-
1
m
=
(
i1
+
i2
)
//
2
# si le kmer est trouvé au début du suffixe, renvoyer la position
if
self
.
__seq
[
self
.
__suffix_table
[
m
]:].
startswith
(
seq_to_search
):
return
self
.
__suffix_table
[
m
]
# si le kmer est plus grand que le suffixe du milieu, parcourir la droite du tableau des suffixes
if
seq_to_search
>
self
.
__seq
[
self
.
__suffix_table
[
m
]:]:
return
self
.
__search_suffix
(
self
.
__seq
,
self
.
__suffix_table
,
seq_to_search
,
m
+
1
,
i2
)
# s'il est plus petit, parcourir la gauche
else
:
return
self
.
__search_suffix
(
self
.
__seq
,
self
.
__suffix_table
,
seq_to_search
,
i1
,
m
)
# découpe le genome en n kmers de taille k
def
split_kmers
(
genome
,
k
):
i
=
0
res
=
[]
#
def split_kmers(genome, k):
#
i = 0
#
res = []
# chaque k lettres, découper un nouveau mot et le placer dans la liste des kmers
for
i
in
range
(
0
,
len
(
genome
),
k
):
res
.
append
(
genome
[
i
:
i
+
k
])
#
# chaque k lettres, découper un nouveau mot et le placer dans la liste des kmers
#
for i in range(0, len(genome), k):
#
res.append(genome[i:i+k])
# si le kmer de fin est trop petit le retirer
if
len
(
res
[
len
(
res
)
-
1
])
<
k
:
res
.
pop
()
#
# si le kmer de fin est trop petit le retirer
#
if len(res[len(res)-1]) < k:
#
res.pop()
return
res
#
return res
# crée la table des indices des suffixes triés
def
create_suffix_table
(
genome
):
#
génère indices de 0 à la taille du génome
indices
=
[
i
for
i
in
range
(
len
(
genome
))]
return
sorted
(
indices
,
key
=
get_suffix
)
#
def create_suffix_table(genome):
#
génère indices de 0 à la taille du génome
#
indices = [i for i in range(len(genome))]
#
return sorted(indices, key=get_suffix)
# obtenir le suffixe à la position i
def
get_suffix
(
i
):
return
gene_grippe
[
i
:]
def
search_suffix
(
genome
,
suffix_table
,
kmer
,
i1
,
i2
):
# kmer non trouvé
if
i2
-
i1
<
1
:
return
-
1
m
=
(
i1
+
i2
)
//
2
# si le kmer est trouvé au début du suffixe, renvoyer la position
if
genome
[
suffix_table
[
m
]:].
startswith
(
kmer
):
return
suffix_table
[
m
]
# si le kmer est plus grand que le suffixe du milieu, parcourir la droite du tableau des suffixes
if
kmer
>
genome
[
suffix_table
[
m
]:]:
return
search_suffix
(
genome
,
suffix_table
,
kmer
,
m
+
1
,
i2
)
# s'il est plus petit, parcourir la gauche
else
:
return
search_suffix
(
genome
,
suffix_table
,
kmer
,
i1
,
m
)
# def get_suffix(i):
# return gene_grippe[i:]
# def search_suffix(genome, kmer, i1, i2):
# # kmer non trouvé
# if i2 - i1 < 1:
# return -1
# m = (i1 + i2) // 2
# # si le kmer est trouvé au début du suffixe, renvoyer la position
# if genome[suffix_table[m]:].startswith(kmer):
# return suffix_table[m]
# # si le kmer est plus grand que le suffixe du milieu, parcourir la droite du tableau des suffixes
# if kmer > genome[suffix_table[m]:]:
# return search_suffix(genome, suffix_table, kmer, m+1, i2)
# # s'il est plus petit, parcourir la gauche
# else:
# return search_suffix(genome, suffix_table, kmer, i1, m)
class
Read
:
def
__init__
(
self
,
seq
):
self
.
__seq
=
seq
def
split_into_kmers
(
self
,
k
):
i
=
0
res
=
[]
# chaque k lettres, découper un nouveau mot et le placer dans la liste des kmers
for
i
in
range
(
0
,
len
(
self
.
__seq
),
k
):
res
.
append
(
self
.
__seq
[
i
:
i
+
k
])
# si le kmer de fin est trop petit le retirer
if
len
(
res
[
len
(
res
)
-
1
])
<
k
:
res
.
pop
()
return
res
#print(split_kmers(reads[1], 10))
for
read
in
reads
.
values
():
kmers
=
split_kmers
(
read
,
11
)
for
gene_grippe
in
genes_grippe
.
values
():
suffix_table
=
create_suffix_table
(
gene_grippe
)
for
kmer
in
kmers
:
pos
=
search_suffix
(
gene_grippe
,
suffix_table
,
kmer
,
0
,
len
(
suffix_table
))
if
pos
!=
-
1
:
start
=
pos
-
len
(
read
)
end
=
pos
+
len
(
read
)
# aligner le genome sur le read (les gaps du genome ne sont pas pénalisés)
# print(str(max(start,0)) + " " + str(min(end, len(gene_grippe)-1)))
# print(str(start) + " " + str(end))
al
=
parasail
.
sg_dx_scan_sat
(
read
,
gene_grippe
[
max
(
start
,
0
):
min
(
end
,
len
(
gene_grippe
)
-
1
)],
16
,
4
,
parasail
.
dnafull
)
print
(
al
.
score
/
len
(
read
))
al
=
parasail
.
sg_dx_trace_scan_sat
(
gene_grippe
[
start
:
end
],
read
,
5
,
1
,
parasail
.
dnafull
)
traceback
=
al
.
get_traceback
()
print
(
traceback
.
ref
)
print
(
"
--------------------------
"
)
print
(
traceback
.
query
)
r
=
Read
(
read
)
kmers
=
r
.
split_kmers
(
11
)
for
gene_grippe
in
genes_grippe
.
values
():
_suffix_table
=
create_suffix_table
(
gene_grippe
)
for
kmer
in
kmers
:
pos
=
search_suffix
(
gene_grippe
,
suffix_table
,
kmer
,
0
,
len
(
suffix_table
))
if
pos
!=
-
1
:
start
=
pos
-
len
(
read
)
end
=
pos
+
len
(
read
)
# aligner le genome sur le read (les gaps du genome ne sont pas pénalisés)
# print(str(max(start,0)) + " " + str(min(end, len(gene_grippe)-1)))
# print(str(start) + " " + str(end))
al
=
parasail
.
sg_dx_scan_sat
(
read
,
gene_grippe
[
max
(
start
,
0
):
min
(
end
,
len
(
gene_grippe
)
-
1
)],
16
,
4
,
parasail
.
dnafull
)
print
(
al
.
score
/
len
(
read
))
al
=
parasail
.
sg_dx_trace_scan_sat
(
gene_grippe
[
start
:
end
],
read
,
5
,
1
,
parasail
.
dnafull
)
traceback
=
al
.
get_traceback
()
print
(
traceback
.
ref
)
print
(
"
--------------------------
"
)
print
(
traceback
.
query
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment