Skip to content
Snippets Groups Projects
Commit 8e57a4d1 authored by Antaaa28's avatar Antaaa28
Browse files

maj

parent 77d319bf
No related branches found
No related tags found
No related merge requests found
......@@ -35,7 +35,7 @@ class SimpleBloomFilter:
self.size = size
self.num_hashes = num_hashes
self.bit_array = [0] * size
def _hashes(self, item):
hash_values = []
for i in range(self.num_hashes):
......@@ -58,19 +58,19 @@ class SimpleBloomFilter:
class StructureNode:
    """Node of the Bloom-filter tree built by ``Structure``.

    Attributes are plain and public; the tree builder assigns ``left``,
    ``right`` and ``datasets`` after construction.
    """

    def __init__(self, bloom_filter=None):
        """Create a childless node.

        Args:
            bloom_filter: Filter to attach to the node. When omitted (``None``),
                a ``SimpleBloomFilter()`` with its default arguments is created.
        """
        # Compare against None explicitly so that a caller-supplied filter is
        # never discarded just because it happens to evaluate as falsy.
        self.bloom = bloom_filter if bloom_filter is not None else SimpleBloomFilter()
        self.left = None
        self.right = None
        self.datasets = []  # list of dataset names at leaf nodes
class Structure:
    """Tree that organises datasets with Bloom filters for k-mer queries.

    Each leaf carries the Bloom filter of a single dataset. Each internal node
    carries the filter obtained by merging the filters of its two children, so
    a query only descends into subtrees whose filter reports the k-mer.

    The expected outputs below assume that the Bloom filters produce no false
    positives for these particular inputs.

    Usage example:

    >>> datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"]
    >>> kmers_dict = {
    ...     "Dataset1": ["ACGT", "TGCA", "GCTA"],  # k-mers of Dataset1
    ...     "Dataset2": ["CGTA", "GCTA", "TACC"],  # k-mers of Dataset2
    ...     "Dataset3": ["AAGT", "TCCA", "CGGT"],  # k-mers of Dataset3
    ...     "Dataset4": ["TGGC", "GGCA", "CCAA"],  # k-mers of Dataset4
    ... }
    >>> structure = Structure(datasets, kmers_dict, bloom_size=100, num_hashes=1)
    >>> structure.query("TCCA")
    ['Dataset3']
    >>> structure.query("ACGT")
    ['Dataset1']
    >>> structure.query("GGGG")
    []
    >>> structure.query("TGGC")
    ['Dataset4']
    >>> structure.query("CGGT")
    ['Dataset3']
    """

    def __init__(self, datasets, kmers_dict, bloom_size=10000, num_hashes=3):
        """Build the tree for the given datasets.

        Args:
            datasets: Iterable of dataset names, in the order the leaves are
                created.
            kmers_dict: Mapping from each dataset name to its iterable of
                k-mers. Every name in ``datasets`` must be a key.
            bloom_size: Size passed to every leaf ``SimpleBloomFilter``.
            num_hashes: Number of hashes passed to every leaf filter.

        Raises:
            KeyError: If a name in ``datasets`` is missing from ``kmers_dict``.
        """
        self.leaves = {}  # maps dataset names to their Bloom filter nodes
        self.root = self._build_tree(datasets, kmers_dict, bloom_size, num_hashes)

    def _build_tree(self, datasets, kmers_dict, bloom_size, num_hashes):
        """Create the leaves, then pair nodes level by level up to one root.

        Returns:
            The root node, or ``None`` when ``datasets`` is empty.
        """
        nodes = []

        # Step 1: create one leaf per dataset and fill its filter.
        for dataset in datasets:
            bf = SimpleBloomFilter(bloom_size, num_hashes)
            for kmer in kmers_dict[dataset]:
                bf.add(kmer)
            node = StructureNode(bf)
            node.datasets = [dataset]
            self.leaves[dataset] = node
            nodes.append(node)

        # Step 2: repeatedly merge adjacent pairs until a single node remains.
        while len(nodes) > 1:
            new_nodes = []
            for i in range(0, len(nodes), 2):
                if i + 1 < len(nodes):
                    merged_bf = nodes[i].bloom.merge(nodes[i + 1].bloom)
                    parent = StructureNode(merged_bf)
                    parent.left = nodes[i]
                    parent.right = nodes[i + 1]
                    parent.datasets = nodes[i].datasets + nodes[i + 1].datasets
                else:
                    # Odd node out: carried up unchanged to the next level.
                    parent = nodes[i]
                new_nodes.append(parent)
            nodes = new_nodes

        return nodes[0] if nodes else None

    def query(self, kmer):
        """Return the names of the datasets likely to contain ``kmer``.

        The names are collected from the leaves reached by the search, in
        left-to-right order. An empty list is returned when no leaf matches or
        when the tree is empty.

        Usage example:

        >>> datasets = ["DS1", "DS2", "DS3"]
        >>> kmers_dict = {
        ...     "DS1": ["AAA", "CCC"],
        ...     "DS2": ["GGG", "TTT"],
        ...     "DS3": ["CCC", "GGG"]
        ... }
        >>> s = Structure(datasets, kmers_dict, bloom_size=10, num_hashes=1)
        >>> sorted(s.query("CCC"))
        ['DS1', 'DS3']
        >>> sorted(s.query("GGG"))
        ['DS2', 'DS3']
        >>> s.query("AAA")
        ['DS1']
        >>> s.query("TTT")
        ['DS2']
        >>> s.query("XYZ")
        []
        """
        results = []
        self._query_recursive(self.root, kmer, results)
        return results

    def _query_recursive(self, node, kmer, results):
        """Append to ``results`` the datasets of every matching leaf under ``node``."""
        # Prune the whole subtree as soon as its filter rejects the k-mer.
        if node is None or not node.bloom.contains(kmer):
            return
        if node.left is None and node.right is None:
            # Leaf node: report its dataset names directly.
            results.extend(node.datasets)
            return
        self._query_recursive(node.left, kmer, results)
        self._query_recursive(node.right, kmer, results)
# Sample data: each dataset name maps to the k-mers it contains.
datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"]
kmers_dict = {
    "Dataset1": ["ACGT", "TGCA", "GCTA"],
    "Dataset2": ["CGTA", "GCTA", "TACC"],
    "Dataset3": ["AAGT", "TCCA", "CGGT"],
    "Dataset4": ["TGGC", "GGCA", "CCAA"]
}

# Quick manual test: build the tree and print the datasets reported for a few
# k-mers, including one ("GGGG") that appears in none of the sample datasets.
structure = Structure(datasets, kmers_dict, bloom_size=100, num_hashes=1)
query_kmers = ["GCTA", "TCCA", "ACGT", "GGGG"]
for kmer in query_kmers:
    result = structure.query(kmer)
    print(f"K-mer '{kmer}' found in datasets: {result}")
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment