Skip to content
Snippets Groups Projects
Commit 452994dd authored by University's avatar University
Browse files

Docstrings and Doctests

parent 71c7ebd7
No related branches found
No related tags found
No related merge requests found
import doctest
import hashlib
from typing import List
class SimpleBloomFilter:
    """
    A minimal bloom filter backed by a list of 0/1 flags.

    An item that was added is always reported as present. An item that was
    never added may still be reported as present when all of its hash
    positions were set by other items (false positive).

    >>> bf = SimpleBloomFilter(size=10, num_hashes=2)
    >>> bf.add("AGCT")
    >>> bf.contains("AGCT")
    True
    >>> bf.contains("TGCA")
    False
    """

    def __init__(self, size: int = 100, num_hashes: int = 1):
        """
        Create a new, empty bloom filter.

        :param size: The number of bits in the filter (must be at least 1)
        :param num_hashes: The number of hashes to use (must be at least 1)
        :raises ValueError: If size or num_hashes is smaller than 1
        """
        # A size of 0 would divide by zero when hashing, and zero hashes
        # would make contains() answer True for every item, so reject both
        # up front instead of failing (or silently lying) later.
        if size < 1:
            raise ValueError("size must be at least 1")
        if num_hashes < 1:
            raise ValueError("num_hashes must be at least 1")
        self.size = size
        self.num_hashes = num_hashes
        self.bit_array = [0] * size

    def _hashes(self, item: str) -> list[int]:
        """
        Create hash positions for an item.

        :param item: The item to hash
        :return: A list of num_hashes positions, each in range(size)
        """
        # Prefixing the item with the hash index turns one SHA-256 function
        # into num_hashes different ones.
        return [
            int(hashlib.sha256((str(index) + item).encode()).hexdigest(), 16) % self.size
            for index in range(self.num_hashes)
        ]

    def add(self, item: str) -> None:
        """
        Add an item to the bloom filter.

        :param item: The item to add
        """
        for pos in self._hashes(item):
            self.bit_array[pos] = 1

    def contains(self, item: str) -> bool:
        """
        Check if an item is in the bloom filter.

        :param item: The item to check
        :return: True if every hash position of the item is set, False otherwise
        """
        return all(self.bit_array[pos] for pos in self._hashes(item))
def merge(self, other):
def merge(self, other: 'SimpleBloomFilter') -> 'SimpleBloomFilter':
"""
Merges this bloom filter with another one of the same size.
:param other: Another SimpleBloomFilter
:return: A new SimpleBloomFilter containing the combined data
"""
assert self.size == other.size, "Bloom filters must be of the same size!"
merged_filter = SimpleBloomFilter(self.size, self.num_hashes)
merged_filter.bit_array = [a | b for a, b in zip(self.bit_array, other.bit_array)]
......@@ -28,18 +65,55 @@ class SimpleBloomFilter:
class StructureNode:
    """
    A node in the structure tree.

    Each node holds a bloom filter, references to a left and a right child,
    and a list of dataset names.
    """

    def __init__(self, bloom_filter: "SimpleBloomFilter | None" = None):
        """
        Create a new structure node without children or datasets.

        :param bloom_filter: A bloom filter for the node; when omitted, a
            SimpleBloomFilter with default settings is created
        """
        # Compare against None explicitly so that a caller-supplied filter is
        # never replaced just because it happens to evaluate as falsy.
        self.bloom = bloom_filter if bloom_filter is not None else SimpleBloomFilter()
        self.left = None
        self.right = None
        self.datasets = []  # list of dataset names at leaf nodes
class Structure:
    """
    A tree structure that organizes datasets using bloom filters.

    >>> datasets = ["Dataset1", "Dataset2"]
    >>> kmers_dict = {"Dataset1": ["ACGT", "TGCA"], "Dataset2": ["CGTA", "GCTA"]}
    >>> structure = Structure(datasets, kmers_dict, bloom_size=10, num_hashes=1)
    >>> structure.query("ACGT")
    ['Dataset1']
    >>> structure.query("CGTA")
    ['Dataset2']
    >>> structure.query("TTTT")
    []
    """

    def __init__(self, datasets: list[str], kmers_dict: dict[str, list[str]],
                 bloom_size: int = 10000, num_hashes: int = 3):
        """
        Create a tree structure from datasets and their k-mers.

        :param datasets: A list of dataset names
        :param kmers_dict: A dictionary mapping each dataset name to its k-mers
        :param bloom_size: The size of the bloom filters
        :param num_hashes: The number of hashes to use
        """
        # leaves is assigned before the tree is built so it already exists
        # while _build_tree runs.
        self.leaves = {}  # maps dataset names to their Bloom filter nodes
        self.root = self._build_tree(datasets, kmers_dict, bloom_size, num_hashes)
def _build_tree(self, datasets, kmers_dict, bloom_size, num_hashes):
def _build_tree(self, datasets: list[str], kmers_dict: dict[str, list[str]], bloom_size: int, num_hashes: int) -> StructureNode:
"""
Build a tree structure from dataset k-mers using bloom filters.
:param datasets: A list of datasets
:param kmers_dict: A dictionary of k-mers
:param bloom_size: The size of the bloom filters
:param num_hashes: The number of hashes to use
:return: The root of the tree
"""
nodes = []
# Step 1
......@@ -69,12 +143,23 @@ class Structure:
return nodes[0] if nodes else None
def query(self, kmer: str) -> list[str]:
    """
    Search the structure tree for a given k-mer.

    :param kmer: The k-mer to search for
    :return: A list of dataset names that may contain the k-mer
    """
    matches: list[str] = []
    # The recursive helper returns nothing; it stores matching dataset names
    # in the list handed to it, starting the walk at the root.
    self._query_recursive(self.root, kmer, matches)
    return matches
def _query_recursive(self, node, kmer, results):
def _query_recursive(self, node, kmer: str, results: list[str]) -> None:
"""
Recursively searches the structure tree for a given kmer
:param node: The current node
:param kmer: The k-mer to search for
:param results: A list to store matching dataset names
"""
if node is None:
return
if node.bloom.contains(kmer):
......@@ -85,6 +170,10 @@ class Structure:
self._query_recursive(node.right, kmer, results)
if __name__ == "__main__":
doctest.testmod()
datasets = ["Dataset1", "Dataset2", "Dataset3", "Dataset4"]
kmers_dict = {
"Dataset1": ["ACGT", "TGCA", "GCTA"],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment