Skip to content
Snippets Groups Projects
Commit 68fda1a1 authored by Pacome Riobe
Browse files

Mise à jour et ajout de parsers

parent a1dd29f8
No related branches found
No related tags found
No related merge requests found
import argparse
import html
from collections import Counter
from urllib.parse import quote

import gffutils
import matplotlib.pyplot as plt
import pandas as pd
# Statistics helpers
def avg_lenght(db):
    """Return the average length of a fixed set of feature types.

    The length of a feature is ``end - start + 1``. Features whose start or
    end coordinate is ``None`` (undefined in the GFF file) are ignored so
    that a single incomplete record does not abort the whole computation.

    Parameters:
    - db: object exposing ``features_of_type(name)``, which yields features
      with ``start`` and ``end`` attributes (e.g. a gffutils database).

    Returns:
    - Counter mapping "CDS", "region", "coding_exon" and "intron" to their
      average length; 0 when no usable feature of that type exists.
    """
    # NOTE: the misspelled function name is kept so existing callers still work.
    feature_types = ("CDS", "region", "coding_exon", "intron")
    avg = Counter()
    for feature_type in feature_types:
        lengths = [
            feature.end - feature.start + 1
            for feature in db.features_of_type(feature_type)
            if feature.start is not None and feature.end is not None
        ]
        # Guard against division by zero when the type is absent.
        avg[feature_type] = sum(lengths) / len(lengths) if lengths else 0
    return avg
def count(db):
    """Count how many features of each type the database contains.

    Parameters:
    - db: object exposing ``all_features()``, which yields features with a
      ``featuretype`` attribute (e.g. a gffutils database).

    Returns:
    - Counter mapping each feature type to its number of occurrences,
      e.g. Counter({'coding_exon': 4, 'intron': 3, 'region': 1, 'CDS': 1}).
    """
    # Counter consumes the iterable directly; no manual increment loop needed.
    return Counter(feature.featuretype for feature in db.all_features())
def fusions_stat(count, avg):
    """Combine the count and average-length statistics into one table.

    Parameters:
    - count (dict): number of occurrences of each feature type.
    - avg (dict): average length of each feature type.

    Returns:
    - pandas.DataFrame with the columns "Feature Type", "Count" and
      "Average Length", one row per entry of ``count``.
    """
    key_column = "Feature Type"

    # Turn each mapping into a two-column table.
    counts_table = pd.DataFrame(count.items(), columns=[key_column, "Count"])
    averages_table = pd.DataFrame(avg.items(), columns=[key_column, "Average Length"])

    # Left join on the feature type: every counted type is kept, and types
    # without an average get a missing value in "Average Length".
    return counts_table.merge(averages_table, how="left", on=key_column)
# External links (NCBI)
def liens(feature):
    """Build an HTML link to the NCBI Gene search for a feature.

    The first value of the "ID" attribute is used; when it is missing or
    empty, the first value of the "CDS" attribute is used instead.

    The identifier comes straight from the GFF file and the resulting string
    is embedded in an HTML table rendered without escaping, so it is
    percent-encoded inside the URL and HTML-escaped in the link text.

    Parameters:
    - feature: object with an ``attributes`` mapping of attribute name to a
      list of values (e.g. a gffutils.Feature).

    Returns:
    - str: an ``<a>`` element, or "N/A" when no identifier is available.
    """
    base_url = "https://www.ncbi.nlm.nih.gov/gene/?term="
    for key in ("ID", "CDS"):
        values = feature.attributes.get(key)
        if values:  # skips both a missing key and an empty value list
            identifier = str(values[0])  # first value when several exist
            href = base_url + quote(identifier, safe="")
            label = html.escape(identifier)
            return f'<a href="{href}" target="_blank">{label}</a>'
    return "N/A"  # no identifier to link to
def graphe(stats, output_image=None):
    """Draw a bar chart of the number of features per feature type.

    Parameters:
    - stats (dict): number of occurrences of each feature type. Keys that
      start with "average" are left out of the chart.
    - output_image (str, optional): path of an image file to write the chart
      to. When omitted (the default) nothing is saved.

    Returns:
    - The result of ``plt.show()``; the chart is displayed as a side effect.
    """
    # Keep only the feature counts (not any average-length entries).
    filtered_stats = {
        name: value
        for name, value in stats.items()
        if not name.startswith("average")
    }

    plt.figure(figsize=(10, 5))
    plt.bar(list(filtered_stats.keys()), list(filtered_stats.values()), color="skyblue")

    plt.xlabel("Feature Type")
    plt.ylabel("Count")
    plt.title("Feature Distribution in GFF File")
    plt.xticks(rotation=45)  # rotated labels stay readable

    # Save before showing: the figure may be released once show() returns.
    if output_image is not None:
        plt.savefig(output_image, bbox_inches="tight")
    return plt.show()
def gff_a_html(gff_file, output_html):
"""
......@@ -12,7 +126,7 @@ def gff_a_html(gff_file, output_html):
Returns:
- rien mais enregistre un fichier HTML avec un tableau de donnée la ou on lui a demandé
"""
# base de données temporaire du fichier
# base de données temporaire du fichier
db = gffutils.create_db(gff_file, dbfn=":memory:", force=True, keep_order=True, merge_strategy="create_unique", sort_attribute_values=True)
# memory : pour pas stocker sur disque (RAM)
#force = true : recrée base si elle existe déja
......@@ -28,31 +142,45 @@ def gff_a_html(gff_file, output_html):
"Start": feature.start,
"End": feature.end,
"Strand": feature.strand,
"Locus ID": feature.attributes.get("locus_tag", ["N/A"])[0],
"Parent": feature.attributes.get("Parent", ["None"])[0],
"Gene Product": feature.attributes.get("product", ["Unknown"])[0],
"Sequence": feature.attributes.get('Sequence'),
"CDS": feature.attributes.get('CDS'),
"External Link": liens(feature)
}
data.append(entry)
# TABLEAU :
df = pd.DataFrame(data, columns=["ID", "Start", "End", Strand]) #le reste fonctionne pas
# DATA
df = pd.DataFrame(data, columns=["ID", "Start", "End", "Strand", "Sequence", "CDS", "External Link"])
df_html = df.to_html(index=False, escape=False) #html
# fichier HTML
df.to_html(output_html, index=False)
# STAT
cnt = count(db)
avg = avg_lenght(db)
stats_df = fusions_stat(cnt, avg)
stats_html = stats_df.to_html(index=False) #html
print(f"HTML table generated: {output_html}")
# GRAPH
graph = graphe(cnt)
graph_html = graph.to_html(index=False)
# TOUS les tableaux dans 1 fichier HTML
with open(output_html, "w", encoding="utf-8") as f:
f.write("<h1> DATA </h1>\n")
f.write(df_html) # 1er tableau
f.write("<h1> Statistics </h1>\n")
f.write(stats_html) # 2e tableau
f.write("<h1> Graph </h1>\n")
f.write(graph_html)
if __name__ == "__main__":
#chemin des fichiers de Clara :
#gff_file = "/Users/claramoreno/PycharmProjects/PythonProjectPaster/wormbase_gff2_alt.txt"
#output_html = "/Users/claramoreno/PycharmProjects/PythonProjectPaster/output.html"
print(f"HTML file saved as {output_html}")
#chemin des fichiers :
#gff_file =
#output_html =
if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory, otherwise None
    # would be handed to gff_a_html and fail deep inside the parsing code.
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--gff_file", type=str, required=True, help="Enter a gff file")
    parser.add_argument("-o", "--outfile", type=str, required=True, help="Enter a path for the outfile")
    args = parser.parse_args()
    # gff_a_html returns nothing, so its result is not printed.
    gff_a_html(args.gff_file, args.outfile)
print(gff_a_html(gff_file,output_html))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment