diff --git a/querykmers_tpmiso.py b/querykmers_tpmiso.py index 16dbf6824f68920c9c88f01cc9690dac7d4258ab..a4c69bb3f0984c1142e1a51ddf2b7bb32ccb1def 100644 --- a/querykmers_tpmiso.py +++ b/querykmers_tpmiso.py @@ -35,7 +35,7 @@ class SimpleBloomFilter: self.size = size self.num_hashes = num_hashes self.bit_array = [0] * size - + def _hashes(self, item): hash_values = [] for i in range(self.num_hashes): @@ -58,19 +58,19 @@ class SimpleBloomFilter: class StructureNode: - def __init__(self, bloom_filter=None): - self.bloom = bloom_filter if bloom_filter else SimpleBloomFilter() - self.left = None - self.right = None - self.datasets = [] # list of dataset names at leaf nodes + def __init__(self, bloom_filter=None): + self.bloom = bloom_filter if bloom_filter else SimpleBloomFilter() + self.left = None + self.right = None + self.datasets = [] # list of dataset names at leaf nodes class Structure: - """ - Une structure arborescente qui organise des jeux de données (datasets) en - utilisant des filtres de Bloom pour permettre des requetes efficaces de k-mers. + """ + Une structure arborescente qui organise des jeux de données (datasets) en + utilisant des filtres de Bloom pour permettre des requetes efficaces de k-mers. - Exemples d'utilisation: - >>> datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"] + Exemples d'utilisation: + >>> datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"] >>> kmers_dict = { ... "Dataset1": ["ACGT", "TGCA", "GCTA"], # k-mers du Dataset1 ... "Dataset2": ["CGTA", "GCTA", "TACC"], # k-mers du Dataset2 @@ -79,112 +79,109 @@ class Structure: ... } >>> structure= Structure(datasets, kmers_dict, bloom_size=100, num_hashes=1) - >>> structure.query("TCCA") #Recherche du k-mer "GCTA" - ['Dataset1', 'Dataset2'] #"GCTA" est présent dans Dataset1 et Dataset2 - - >>> structure.query("TCCA") # Recherche du k-mer "TCCA" - ['Dataset3'] # "TCCA" est présent dans Dataset3 - - >>> structure.query("ACGT") # Recherche du k-mer "ACGT" - ['Dataset1'] # "ACGT" est présent dans Dataset1 - - >>> structure.query("GGGG") # Recherche du k-mer "GGGG" - [] # "GGGG" n'est présent dans aucun dataset - - >>> structure.query("TGGC") # Recherche du k-mer "TGGC" - ['Dataset4'] # "TGGC" est présent dans Dataset4 - - >>> structure.query("CGGT") # Recherche du k-mer "CGGT" - ['Dataset3'] # "CGGT" est présent dans Dataset3 - """ - def __init__(self, datasets, kmers_dict, bloom_size=10000, num_hashes=3): - self.leaves = {} # maps dataset names to their Bloom filter nodes - self.root = self._build_tree(datasets, kmers_dict, bloom_size, num_hashes) - - def _build_tree(self, datasets, kmers_dict, bloom_size, num_hashes): - nodes = [] - - # Step 1 : # Création des feuilles pour chaque jeu de données - for dataset in datasets: - bf = SimpleBloomFilter(bloom_size, num_hashes) - for kmer in kmers_dict[dataset]: - bf.add(kmer) - node = StructureNode(bf) - node.datasets = [dataset] - self.leaves[dataset] = node - nodes.append(node) - - # Step 2 : Fusion itérative des nœuds pour construire l'arbre - while len(nodes) > 1: - new_nodes = [] - for i in range(0, len(nodes), 2): - if i + 1 < len(nodes): - merged_bf = nodes[i].bloom.merge(nodes[i + 1].bloom) - parent = StructureNode(merged_bf) - parent.left = nodes[i] - parent.right = nodes[i + 1] - parent.datasets = nodes[i].datasets + nodes[i + 1].datasets - else: - parent = nodes[i] - new_nodes.append(parent) - nodes = new_nodes - - return nodes[0] if nodes else None - - def query(self, kmer): - """ - Recherche un k-mer dans l'arbre et retourne la liste des jeux de données - susceptibles de le contenir. - - Exemple d'utilisation: - >>> datasets = ["DS1", "DS2", "DS3"] - >>> kmers_dict = { - ... "DS1": ["AAA", "CCC"], - ... "DS2": ["GGG", "TTT"], - ... "DS3": ["CCC", "GGG"] - ... } - >>> s = Structure(datasets, kmers_dict, bloom_size=10, num_hashes=1) - >>> sorted(s.query("CCC")) - ['DS1', 'DS3'] - >>> sorted(s.query("GGG")) - ['DS2', 'DS3'] - >>> s.query("AAA") - ['DS1'] - >>> s.query("TTT") - ['DS2'] - >>> s.query("XYZ") - [] - """ - - results = [] - self._query_recursive(self.root, kmer, results) - return results - - def _query_recursive(self, node, kmer, results): - if node is None: - return - if node.bloom.contains(kmer): # Si le nœud est une feuille, ajouter directement les jeux de données - if node.left is None and node.right is None: - results.extend(node.datasets) - else: - self._query_recursive(node.left, kmer, results) - self._query_recursive(node.right, kmer, results) + >>> structure.query("TCCA") + ['Dataset3'] + + >>> structure.query("ACGT") + ['Dataset1'] + + >>> structure.query("GGGG") + [] + + >>> structure.query("TGGC") + ['Dataset4'] + + >>> structure.query("CGGT") + ['Dataset3'] + """ + def __init__(self, datasets, kmers_dict, bloom_size=10000, num_hashes=3): + self.leaves = {} # maps dataset names to their Bloom filter nodes + self.root = self._build_tree(datasets, kmers_dict, bloom_size, num_hashes) + + def _build_tree(self, datasets, kmers_dict, bloom_size, num_hashes): + nodes = [] + + # Step 1 : # Création des feuilles pour chaque jeu de données + for dataset in datasets: + bf = SimpleBloomFilter(bloom_size, num_hashes) + for kmer in kmers_dict[dataset]: + bf.add(kmer) + node = StructureNode(bf) + node.datasets = [dataset] + self.leaves[dataset] = node + nodes.append(node) + + # Step 2 : Fusion itérative des nœuds pour construire l'arbre + while len(nodes) > 1: + new_nodes = [] + for i in range(0, len(nodes), 2): + if i + 1 < len(nodes): + merged_bf = nodes[i].bloom.merge(nodes[i + 1].bloom) + parent = StructureNode(merged_bf) + parent.left = nodes[i] + parent.right = nodes[i + 1] + parent.datasets = nodes[i].datasets + nodes[i + 1].datasets + else: + parent = nodes[i] + new_nodes.append(parent) + nodes = new_nodes + + return nodes[0] if nodes else None + + def query(self, kmer): + """ + Recherche un k-mer dans l'arbre et retourne la liste des jeux de données + susceptibles de le contenir. + + Exemple d'utilisation: + >>> datasets = ["DS1", "DS2", "DS3"] + >>> kmers_dict = { + ... "DS1": ["AAA", "CCC"], + ... "DS2": ["GGG", "TTT"], + ... "DS3": ["CCC", "GGG"] + ... } + >>> s = Structure(datasets, kmers_dict, bloom_size=10, num_hashes=1) + >>> sorted(s.query("CCC")) + ['DS1', 'DS3'] + >>> sorted(s.query("GGG")) + ['DS2', 'DS3'] + >>> s.query("AAA") + ['DS1'] + >>> s.query("TTT") + ['DS2'] + >>> s.query("XYZ") + [] + """ + + results = [] + self._query_recursive(self.root, kmer, results) + return results + + def _query_recursive(self, node, kmer, results): + if node is None: + return + if node.bloom.contains(kmer): # Si le nœud est une feuille, ajouter directement les jeux de données + if node.left is None and node.right is None: + results.extend(node.datasets) + else: + self._query_recursive(node.left, kmer, results) + self._query_recursive(node.right, kmer, results) datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"] kmers_dict = { - "Dataset1": ["ACGT", "TGCA", "GCTA"], - "Dataset2": ["CGTA", "GCTA", "TACC"], - "Dataset3": ["AAGT", "TCCA", "CGGT"], - "Dataset4": ["TGGC", "GGCA", "CCAA"] + "Dataset1": ["ACGT", "TGCA", "GCTA"], + "Dataset2": ["CGTA", "GCTA", "TACC"], + "Dataset3": ["AAGT", "TCCA", "CGGT"], + "Dataset4": ["TGGC", "GGCA", "CCAA"] } #test structure = Structure(datasets, kmers_dict, bloom_size=100, num_hashes=1) query_kmers = ["GCTA", "TCCA", "ACGT", "GGGG"] for kmer in query_kmers: - result = structure.query(kmer) - print(f"K-mer '{kmer}' found in datasets: {result}") + result = structure.query(kmer) + print(f"K-mer '{kmer}' found in datasets: {result}") if __name__ == "__main__":