diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..04aedb24d021cb846006d38dec988ff2716e73ed
--- /dev/null
+++ b/app.py
@@ -0,0 +1,63 @@
+from flask import Flask, request, jsonify, render_template
+import math
+import os
+import pickle
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+app = Flask(__name__)
+
+# Feature columns, in the exact order used to fit the scaler and the model
+# in ml_binairefinal.py.
+FEATURE_COLUMNS = [
+    'year', 'acousticness', 'danceability', 'energy', 'explicit',
+    'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
+    'speechiness', 'tempo', 'valence', 'nb_caracteres_sans_espaces',
+    'nb_artistes', 'featuring', 'duree_minute', 'categorie_annee',
+    'categorie_tempo',
+]
+
+# Load the trained model and the scaler fitted on the training set; both
+# files are produced by running ml_binairefinal.py.
+# NOTE: pickle.load can execute arbitrary code; only load trusted files.
+with open('random_forest_model.pkl', 'rb') as model_file:
+    rf = pickle.load(model_file)
+with open('scaler.pkl', 'rb') as scaler_file:
+    scaler = pickle.load(scaler_file)
+
+@app.route('/')
+def home():
+    """Serve the prediction form."""
+    return render_template('index.html')
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    """Predict the popularity class of one track posted as form data."""
+    data = request.form.to_dict()
+
+    # Form values arrive as strings: reject blanks, then convert to float.
+    missing = [col for col in FEATURE_COLUMNS if not data.get(col, '').strip()]
+    if missing:
+        return jsonify({'error': f"Champs manquants : {', '.join(missing)}"}), 400
+    try:
+        row = {col: float(data[col]) for col in FEATURE_COLUMNS}
+    except ValueError:
+        return jsonify({'error': 'Toutes les valeurs doivent être numériques.'}), 400
+    if not all(math.isfinite(value) for value in row.values()):
+        return jsonify({'error': 'Toutes les valeurs doivent être finies.'}), 400
+
+    input_data = pd.DataFrame([row], columns=FEATURE_COLUMNS)
+
+    # Reuse the scaler fitted at training time: fitting a new StandardScaler
+    # on a single row maps every feature to 0, so every request would get
+    # the same prediction.
+    input_data_scaled = scaler.transform(input_data)
+
+    predictions = rf.predict(input_data_scaled)
+
+    return jsonify({'predictions': predictions.tolist()})
+
+if __name__ == '__main__':
+    # The Werkzeug debugger allows arbitrary code execution, so it is only
+    # enabled when FLASK_DEBUG=1 is set explicitly.
+    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
diff --git a/ml_binairefinal.py b/ml_binairefinal.py
new file mode 100644
index 0000000000000000000000000000000000000000..c355b6ed4d6aaa0503781678d4f92be1352cf028
--- /dev/null
+++ b/ml_binairefinal.py
@@ -0,0 +1,89 @@
+import pickle
+import time
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report
+
+# Columns that are not model inputs (targets, identifiers, raw text/date
+# fields); shared by training and by get_independent_variables().
+NON_FEATURE_COLUMNS = ['popularity_2', 'popularity', 'id', 'artists', 'name',
+                       'release_date', 'date_sortie', 'duration_ms', 'nom_artiste']
+
+def load_and_describe_data(file_path):
+    """Load a CSV file and print basic information about it."""
+    df = pd.read_csv(file_path)
+    # DataFrame.info() prints its report itself and returns None; wrapping
+    # it in print() only added a stray "None" line to the output.
+    df.info()
+    return df
+
+def train_random_forest(df):
+    """Train, save and evaluate the binary popularity classifier.
+
+    Fits a StandardScaler and a RandomForestClassifier on 80% of ``df``,
+    pickles both, prints test metrics and writes the test-set predictions
+    to a ``pred_rf`` column of ``df`` (rows used for training stay NaN).
+    Returns ``df``.
+    """
+    start_time = time.time()
+
+    # 1. Separate the features from the target
+    y = df["popularity_2"]
+    X = df.drop(columns=NON_FEATURE_COLUMNS)
+
+    # 2. Stratified train/test split
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
+
+    # 3. Feature scaling (fitted on the training set only)
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    # 4. Model with the tuned hyperparameters
+    rf = RandomForestClassifier(n_estimators=200, max_depth=None, criterion="gini")
+
+    # 5. Training
+    rf.fit(X_train_scaled, y_train)
+
+    # Save the model AND the fitted scaler: the model was trained on scaled
+    # features, so app.py must apply this exact scaler before predicting.
+    with open('random_forest_model.pkl', 'wb') as model_file:
+        pickle.dump(rf, model_file)
+    with open('scaler.pkl', 'wb') as scaler_file:
+        pickle.dump(scaler, scaler_file)
+
+    # 6. Prediction
+    y_pred = rf.predict(X_test_scaled)
+
+    # 7. Evaluation
+    accuracy = accuracy_score(y_test, y_pred)
+    print(f"\n✅ Accuracy : {accuracy:.2%}")
+    print("\n🔍 Rapport de classification :\n", classification_report(y_test, y_pred))
+
+    # 8. Add the test-set predictions to the DataFrame
+    df.loc[X_test.index, "pred_rf"] = y_pred
+    print(df.head(10))
+
+    # Execution time
+    elapsed_time = time.time() - start_time
+    print(f"\n⏱️ Temps d'exécution : {elapsed_time:.2f} secondes")
+
+    return df
+
+def get_independent_variables(df):
+    """Return the names of the feature columns of ``df`` as a list."""
+    features = df.drop(columns=NON_FEATURE_COLUMNS)
+    # "pred_rf" is added by train_random_forest(): it is a model output,
+    # not an input feature.
+    return [column for column in features.columns if column != 'pred_rf']
+
+# Load the data and train the model
+df = load_and_describe_data("data_binaire.csv")
+df = train_random_forest(df)
+
+# List of the independent variables
+independent_variables = get_independent_variables(df)
diff --git a/mlsup0final.py b/mlsup0final.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4b60104aab69bcc110af65eee50ee309b589888
--- /dev/null
+++ b/mlsup0final.py
@@ -0,0 +1,64 @@
+import numpy as np
+import pandas as pd
+import time
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+
+def load_and_describe_data(file_path):
+    """Load a CSV file and print basic information about it."""
+    df = pd.read_csv(file_path)
+    df.info()  # info() prints by itself and returns None; print() added a stray "None"
+    return df
+
+def train_random_forest(df):
+    """Train and evaluate a RandomForestRegressor on ``popularity``; return ``df`` with test predictions in ``pred_rf``."""
+    start_time = time.time()  # Timer
+
+    # 1. Separate the features from the target
+    X = df.drop(columns=["popularity", "id", "artists", "name", "release_date", "date_sortie", "duration_ms", "nom_artiste"])
+    y = df["popularity"]
+
+    # 2. Train/test split
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # 3. Feature scaling (optional for RandomForest, but may help)
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    # 4. Train the RandomForest model with fixed hyperparameters
+    rf = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=2, random_state=42)
+    rf.fit(X_train_scaled, y_train)
+
+    # 5. Predict on the test data
+    y_pred = rf.predict(X_test_scaled)
+
+    # 6. Model evaluation
+    mae = mean_absolute_error(y_test, y_pred)
+    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+    r2 = r2_score(y_test, y_pred)
+
+    print(f"\n📊 RandomForestRegressor - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")
+
+    # 7. Add the test-set predictions to the DataFrame
+    df.loc[X_test.index, "pred_rf"] = y_pred
+
+    # 8. Show the feature importances
+    feature_importance = dict(zip(X.columns, rf.feature_importances_))
+    sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
+
+    print("\n🔥 Importance des variables :")
+    for feature, importance in sorted_importance[:10]:  # Show the 10 most important
+        print(f"{feature}: {importance:.4f}")
+
+    # Execution time
+    elapsed_time = time.time() - start_time
+    print(f"\n⏱️ Temps d'exécution : {elapsed_time:.2f} secondes")
+    print(df[['popularity', 'pred_rf']].head(30))
+    return df
+
+# Load the data and train the model
+df = load_and_describe_data("data_sup_0popularity.csv")
+df = train_random_forest(df)
diff --git a/modelbinairefinal.txt b/modelbinairefinal.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cefe03b2d9a6ca6cee1fa68b011cc28ae73b969
--- /dev/null
+++ b/modelbinairefinal.txt
@@ -0,0 +1,65 @@
+nohup: ignoring input
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 127979 entries, 0 to 127978
+Data columns (total 28 columns):
+ #   Column                      Non-Null Count   Dtype
+---  ------                      --------------   -----
+ 0   id                          127979 non-null  object
+ 1   artists                     127979 non-null  object
+ 2   name                        127979 non-null  object
+ 3   year                        127979 non-null  int64
+ 4   acousticness                127979 non-null  float64
+ 5   danceability                127979 non-null  float64
+ 6   duration_ms                 127979 non-null  int64
+ 7   energy                      127979 non-null  float64
+ 8   explicit                    127979 non-null  int64
+ 9   instrumentalness            127979 non-null  float64
+ 10  key                         127979 non-null  int64
+ 11  liveness                    127979 non-null  float64
+ 12  loudness                    127979 non-null  float64
+ 13  mode                        127979 non-null  int64
+ 14  release_date                127979 non-null  object
+ 15  speechiness                 127979 non-null  float64
+ 16  tempo                       127979 non-null  float64
+ 17  valence                     127979 non-null  float64
+ 18  popularity                  127979 non-null  int64
+ 19  date_sortie                 127979 non-null  object
+ 20  nom_artiste                 127979 non-null  object
+ 21  nb_caracteres_sans_espaces  127979 non-null  int64
+ 22  nb_artistes                 127979 non-null  int64
+ 23  featuring                   127979 non-null  int64
+ 24  duree_minute                127979 non-null  float64
+ 25  categorie_annee             127979 non-null  int64
+ 26  categorie_tempo             127979 non-null  int64
+ 27  popularity_2                127979 non-null  int64
+dtypes: float64(10), int64(12), object(6)
+memory usage: 27.3+ MB
+None
+
+✅ Accuracy : 91.57%
+
+🔍 Rapport de classification :
+               precision    recall  f1-score   support
+
+           0       0.82      0.82      0.82      5964
+           1       0.95      0.94      0.95     19632
+
+    accuracy                           0.92     25596
+   macro avg       0.88      0.88      0.88     25596
+weighted avg       0.92      0.92      0.92     25596
+
+                       id  ...  pred_rf
+0  7pQSmQ0l7QdBeL9X6CEMbH  ...      NaN
+1  2g8MTBFRUSFKbY5RQiSSEE  ...      NaN
+2  2LcJoQ5SDUZrC2qUjWMEdF  ...      NaN
+3  6RzHyUtRNARYyn2AIuoLnY  ...      NaN
+4  6Kd0I5es8911FZpYhFS053  ...      NaN
+5  0bRAVjALdeHy5hnogfnRkT  ...      NaN
+6  0IhY390qx5QJEnRXpeuEwq  ...      NaN
+7  1e7M98usgS7tK89PoEbqpz  ...      NaN
+8  4gflNrWUQVemKn7It2kRKK  ...      NaN
+9  0JR24LYC8buXcxKXmUT9kt  ...      0.0
+
+[10 rows x 29 columns]
+
+⏱️ Temps d'exécution : 57.46 secondes
diff --git a/modelsup0final.txt b/modelsup0final.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e95cdb7bf8cfd467a5f71781da7b1445a4b731d1
--- /dev/null
+++ b/modelsup0final.txt
@@ -0,0 +1,83 @@
+nohup: ignoring input
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 98159 entries, 0 to 98158
+Data columns (total 27 columns):
+ #   Column                      Non-Null Count  Dtype
+---  ------                      --------------  -----
+ 0   id                          98159 non-null  object
+ 1   artists                     98159 non-null  object
+ 2   name                        98159 non-null  object
+ 3   year                        98159 non-null  int64
+ 4   acousticness                98159 non-null  float64
+ 5   danceability                98159 non-null  float64
+ 6   duration_ms                 98159 non-null  int64
+ 7   energy                      98159 non-null  float64
+ 8   explicit                    98159 non-null  int64
+ 9   instrumentalness            98159 non-null  float64
+ 10  key                         98159 non-null  int64
+ 11  liveness                    98159 non-null  float64
+ 12  loudness                    98159 non-null  float64
+ 13  mode                        98159 non-null  int64
+ 14  release_date                98159 non-null  object
+ 15  speechiness                 98159 non-null  float64
+ 16  tempo                       98159 non-null  float64
+ 17  valence                     98159 non-null  float64
+ 18  popularity                  98159 non-null  int64
+ 19  date_sortie                 98159 non-null  object
+ 20  nom_artiste                 98159 non-null  object
+ 21  nb_caracteres_sans_espaces  98159 non-null  int64
+ 22  nb_artistes                 98159 non-null  int64
+ 23  featuring                   98159 non-null  int64
+ 24  duree_minute                98159 non-null  float64
+ 25  categorie_annee             98159 non-null  int64
+ 26  categorie_tempo             98159 non-null  int64
+dtypes: float64(10), int64(11), object(6)
+memory usage: 20.2+ MB
+None
+
+📊 RandomForestRegressor - MAE: 8.82, RMSE: 12.04, R²: 0.607
+
+🔥 Importance des variables :
+year: 0.5028
+instrumentalness: 0.0748
+nb_caracteres_sans_espaces: 0.0516
+loudness: 0.0402
+duree_minute: 0.0397
+liveness: 0.0385
+acousticness: 0.0375
+energy: 0.0343
+danceability: 0.0343
+valence: 0.0336
+
+⏱️ Temps d'exécution : 284.81 secondes
+    popularity    pred_rf
+0           24        NaN
+1           38        NaN
+2           40        NaN
+3           20        NaN
+4           47        NaN
+5           32        NaN
+6           52        NaN
+7           21        NaN
+8           36  36.570159
+9            6        NaN
+10          35        NaN
+11           9        NaN
+12          37        NaN
+13          63        NaN
+14          21        NaN
+15           1        NaN
+16          30        NaN
+17          48        NaN
+18          57        NaN
+19          19        NaN
+20          34        NaN
+21          18        NaN
+22          31        NaN
+23          58        NaN
+24           5        NaN
+25          34        NaN
+26          15        NaN
+27           5        NaN
+28          30        NaN
+29          35        NaN
diff --git a/random_forest_model.pkl b/random_forest_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..53681515e3a7abfedbdbcc78e0c9bb4dbeeb691a
Binary files /dev/null and b/random_forest_model.pkl differ
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..1aad35ee4c7d0330d179ab24bf4cb1d6434a82d5
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,130 @@
+<!DOCTYPE html>
+<html lang="fr">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Prédiction de Popularité</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 20px;
+        }
+        h1 {
+            color: #333;
+        }
+        form {
+            max-width: 600px;
+            margin: auto;
+        }
+        label {
+            display: block;
+            margin-top: 10px;
+            font-weight: bold;
+        }
+        input[type="text"] {
+            width: 100%;
+            padding: 8px;
+            margin-top: 5px;
+            border: 1px solid #ccc;
+            border-radius: 4px;
+        }
+        input[type="submit"] {
+            margin-top: 20px;
+            padding: 10px 20px;
+            background-color: #4CAF50;
+            color: white;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+        }
+        input[type="submit"]:hover {
+            background-color: #45a049;
+        }
+        #result {
+            margin-top: 20px;
+            font-size: 1.2em;
+            color: #555;
+        }
+    </style>
+</head>
+<body>
+    <h1>Prédire la Popularité</h1>
+    <form action="/predict" method="post">
+        <label for="year">Année :</label>
+        <input type="text" id="year" name="year">
+
+        <label for="acousticness">Acousticness :</label>
+        <input type="text" id="acousticness" name="acousticness">
+
+        <label for="danceability">Danceability :</label>
+        <input type="text" id="danceability" name="danceability">
+
+        <label for="energy">Energy :</label>
+        <input type="text" id="energy" name="energy">
+
+        <label for="explicit">Explicit :</label>
+        <input type="text" id="explicit" name="explicit">
+
+        <label for="instrumentalness">Instrumentalness :</label>
+        <input type="text" id="instrumentalness" name="instrumentalness">
+
+        <label for="key">Key :</label>
+        <input type="text" id="key" name="key">
+
+        <label for="liveness">Liveness :</label>
+        <input type="text" id="liveness" name="liveness">
+
+        <label for="loudness">Loudness :</label>
+        <input type="text" id="loudness" name="loudness">
+
+        <label for="mode">Mode :</label>
+        <input type="text" id="mode" name="mode">
+
+        <label for="speechiness">Speechiness :</label>
+        <input type="text" id="speechiness" name="speechiness">
+
+        <label for="tempo">Tempo :</label>
+        <input type="text" id="tempo" name="tempo">
+
+        <label for="valence">Valence :</label>
+        <input type="text" id="valence" name="valence">
+
+        <label for="nb_caracteres_sans_espaces">Nombre de caractères sans espaces :</label>
+        <input type="text" id="nb_caracteres_sans_espaces" name="nb_caracteres_sans_espaces">
+
+        <label for="nb_artistes">Nombre d'artistes :</label>
+        <input type="text" id="nb_artistes" name="nb_artistes">
+
+        <label for="featuring">Featuring :</label>
+        <input type="text" id="featuring" name="featuring">
+
+        <label for="duree_minute">Durée (minutes) :</label>
+        <input type="text" id="duree_minute" name="duree_minute">
+
+        <label for="categorie_annee">Catégorie année :</label>
+        <input type="text" id="categorie_annee" name="categorie_annee">
+
+        <label for="categorie_tempo">Catégorie tempo :</label>
+        <input type="text" id="categorie_tempo" name="categorie_tempo">
+
+        <input type="submit" value="Prédire">
+    </form>
+    <div id="result"></div>
+
+    <script>
+        document.querySelector('form').addEventListener('submit', async function(event) {
+            event.preventDefault();
+            const form = event.target;
+            const formData = new FormData(form);
+
+            const response = await fetch('/predict', {
+                method: 'POST',
+                body: formData,
+            });
+
+            const result = await response.json();
+            document.getElementById('result').innerText = response.ok ? `Prédiction: ${result.predictions}` : `Erreur: ${result.error}`;
+        });
+    </script>
+</body>
+</html>