diff --git a/app.py b/app.py index 935a508946168fb8311b56ae5056eeb58c4bc727..587200ea0a37aa92a117cdeb228b6776352c5721 100644 --- a/app.py +++ b/app.py @@ -7,9 +7,13 @@ import numpy as np app = Flask(__name__) # Charger le modèle -with open('random_forest_model.pkl', 'rb') as model_file: +with open('random_forest_model_binaire.pkl', 'rb') as model_file: rf = pickle.load(model_file) +# Charger le scaler entraîné +with open('scaler_binaire.pkl', 'rb') as scaler_file: + scaler = pickle.load(scaler_file) + @app.route('/') def home(): return render_template('index.html') @@ -18,16 +22,42 @@ def home(): def predict(): # Récupérer les données du formulaire data = request.form.to_dict() - input_data = pd.DataFrame([data]) - + + # Calculer automatiquement le nombre de caractères sans espaces + if 'name' in data: + data['nb_caracteres_sans_espaces'] = len(data['name'].replace(" ", "")) + + # Supprimer la clé 'name' qui n'est pas une feature + data.pop('name', None) + + # Convertir les valeurs en float si possible + for key in data: + try: + data[key] = float(data[key]) + except ValueError: + pass # Garder les valeurs non convertibles (ex: texte) + + # Liste des features dans le bon ordre (comme lors de l'entraînement) + expected_features = ['year', 'acousticness', 'danceability', 'energy', 'explicit', + 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', + 'speechiness', 'tempo', 'valence', 'nb_artistes', 'featuring', + 'duree_minute', 'categorie_annee', 'categorie_tempo', 'nb_caracteres_sans_espaces'] + + # S'assurer que les colonnes du DataFrame correspondent à celles du modèle, dans le bon ordre + input_data = pd.DataFrame([[data.get(key, 0) for key in expected_features]], columns=expected_features) + + # Vérifier que toutes les colonnes attendues sont présentes + missing_cols = [col for col in expected_features if col not in input_data.columns] + if missing_cols: + return jsonify({'error': f'Missing features: {missing_cols}'}), 400 + # Normalisation des features - scaler = StandardScaler() - input_data_scaled = scaler.fit_transform(input_data) - + input_data_scaled = scaler.transform(input_data) + # Prédiction - predictions = np.round(rf.predict(input_data_scaled),4) - - return jsonify({'predictions': predictions.tolist()}) + predictions = rf.predict(input_data_scaled) + + return jsonify({'predictions': int(predictions[0])}) if __name__ == '__main__': - app.run(debug=True) + app.run(debug=True) \ No newline at end of file diff --git a/ml_binairefinal.py b/ml_binairefinal.py index 0688e9ed318f280dd16008fe2999bfcbf278f8aa..16f67a598790f19e66aeaeff94d44d32230dfe0a 100644 --- a/ml_binairefinal.py +++ b/ml_binairefinal.py @@ -5,6 +5,8 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, classification_report +import pickle + def load_and_describe_data(file_path): """Charge un fichier CSV et affiche les informations de base.""" @@ -12,16 +14,26 @@ def load_and_describe_data(file_path): print(df.info()) return df + def train_random_forest(df): start_time = time.time() # ⏳ Timer # 1️⃣ Séparation des features et de la cible y = df["popularity_2"] - X = df.drop(columns=['popularity_2', 'popularity', 'id', 'artists', 'name', + X = df.drop(columns=['popularity_2', 'popularity', 'id', 'artists', 'name', 'release_date', 'date_sortie', 'duration_ms', 'nom_artiste']) + # Liste des features dans le bon ordre (doit être identique dans l'API Flask) + expected_features = ['year', 'acousticness', 'danceability', 'energy', 'explicit', + 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', + 'speechiness', 'tempo', 'valence', 'nb_artistes', 'featuring', + 'duree_minute', 'categorie_annee', 'categorie_tempo', 'nb_caracteres_sans_espaces'] + + # Vérification des colonnes pour éviter les erreurs + X = X.reindex(columns=expected_features, fill_value=0) + # 2️⃣ Split train/test - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) # 3️⃣ Normalisation des features scaler = StandardScaler() @@ -29,49 +41,33 @@ def train_random_forest(df): X_test_scaled = scaler.transform(X_test) # 4️⃣ Initialisation du modèle avec les hyperparamètres optimisés - rf = RandomForestClassifier(n_estimators=200, max_depth=None, criterion="gini",class_weight='balanced') - + rf = RandomForestClassifier(n_estimators=200, max_depth=None, criterion="gini", class_weight='balanced', random_state=42) + # 5️⃣ Entraînement du modèle rf.fit(X_train_scaled, y_train) - import pickle - # Sauvegarder le modèle - with open('random_forest_model.pkl', 'wb') as model_file: + + # 6️⃣ Sauvegarde du modèle et du scaler + with open('random_forest_model_binaire.pkl', 'wb') as model_file: pickle.dump(rf, model_file) + with open('scaler_binaire.pkl', 'wb') as scaler_file: + pickle.dump(scaler, scaler_file) - # 6️⃣ Prédiction + # 7️⃣ Prédiction y_pred = rf.predict(X_test_scaled) - # 7️⃣ Évaluation + # 8️⃣ Évaluation accuracy = accuracy_score(y_test, y_pred) print(f"\n✅ Accuracy : {accuracy:.2%}") print("\n🔍 Rapport de classification :\n", classification_report(y_test, y_pred)) - # 8️⃣ Ajout des prédictions au DataFrame - df.loc[X_test.index, "pred_rf"] = y_pred - print(df.head(10)) - # ⏳ Temps d'exécution elapsed_time = time.time() - start_time print(f"\n⏱️ Temps d'exécution : {elapsed_time:.2f} secondes") return df + # 📂 Chargement et entraînement df = load_and_describe_data("data_binaire.csv") df = train_random_forest(df) - -def get_independent_variables(df): - # Séparation des features et de la cible - y = df["popularity_2"] - X = df.drop(columns=['popularity_2', 'popularity', 'id', 'artists', 'name', - 'release_date', 'date_sortie', 'duration_ms', 'nom_artiste']) - - # Liste des variables indépendantes - independent_variables = X.columns.tolist() - return independent_variables - -# Obtenir la liste des variables indépendantes -independent_variables = get_independent_variables(df) - -print(df['popularity_2'].value_counts()) \ No newline at end of file diff --git a/random_forest_model.pkl b/random_forest_model.pkl index 348339307c70b4af2bf48bbebfc2613bc6cc61b8..234110c4bf1057de8989392de18bc3ace42a9fda 100644 Binary files a/random_forest_model.pkl and b/random_forest_model.pkl differ diff --git a/random_forest_model_binaire.pkl b/random_forest_model_binaire.pkl new file mode 100644 index 0000000000000000000000000000000000000000..234110c4bf1057de8989392de18bc3ace42a9fda Binary files /dev/null and b/random_forest_model_binaire.pkl differ diff --git a/scaler.pkl b/scaler.pkl new file mode 100644 index 0000000000000000000000000000000000000000..19ab04faf34282436158f3112f223e18033a08ff Binary files /dev/null and b/scaler.pkl differ diff --git a/scaler_binaire.pkl b/scaler_binaire.pkl new file mode 100644 index 0000000000000000000000000000000000000000..19ab04faf34282436158f3112f223e18033a08ff Binary files /dev/null and b/scaler_binaire.pkl differ diff --git a/templates/index.html b/templates/index.html index 1aad35ee4c7d0330d179ab24bf4cb1d6434a82d5..a79ed2f7a26f5ceaf7cca8285c4708fe53db9cf3 100644 --- a/templates/index.html +++ b/templates/index.html @@ -21,7 +21,7 @@ margin-top: 10px; font-weight: bold; } - input[type="text"] { + input[type="text"], input[type="number"] { width: 100%; padding: 8px; margin-top: 5px; @@ -49,81 +49,80 @@ </head> <body> <h1>Prédire la Popularité</h1> - <form action="/predict" method="post"> + <form id="predictionForm"> + <label for="name">Titre de la chanson :</label> + <input type="text" id="name" name="name" required> + <label for="year">Année :</label> - <input type="text" id="year" name="year"> + <input type="number" id="year" name="year" required> <label for="acousticness">Acousticness :</label> - <input type="text" id="acousticness" name="acousticness"> + <input type="number" step="0.01" id="acousticness" name="acousticness" required> <label for="danceability">Danceability :</label> - <input type="text" id="danceability" name="danceability"> + <input type="number" step="0.01" id="danceability" name="danceability" required> <label for="energy">Energy :</label> - <input type="text" id="energy" name="energy"> + <input type="number" step="0.01" id="energy" name="energy" required> <label for="explicit">Explicit :</label> - <input type="text" id="explicit" name="explicit"> + <input type="number" id="explicit" name="explicit" required> <label for="instrumentalness">Instrumentalness :</label> - <input type="text" id="instrumentalness" name="instrumentalness"> + <input type="number" step="0.01" id="instrumentalness" name="instrumentalness" required> <label for="key">Key :</label> - <input type="text" id="key" name="key"> + <input type="number" id="key" name="key" required> <label for="liveness">Liveness :</label> - <input type="text" id="liveness" name="liveness"> + <input type="number" step="0.01" id="liveness" name="liveness" required> <label for="loudness">Loudness :</label> - <input type="text" id="loudness" name="loudness"> + <input type="number" step="0.01" id="loudness" name="loudness" required> <label for="mode">Mode :</label> - <input type="text" id="mode" name="mode"> + <input type="number" id="mode" name="mode" required> <label for="speechiness">Speechiness :</label> - <input type="text" id="speechiness" name="speechiness"> + <input type="number" step="0.01" id="speechiness" name="speechiness" required> <label for="tempo">Tempo :</label> - <input type="text" id="tempo" name="tempo"> + <input type="number" step="0.01" id="tempo" name="tempo" required> <label for="valence">Valence :</label> - <input type="text" id="valence" name="valence"> - - <label for="nb_caracteres_sans_espaces">Nombre de caractères sans espaces :</label> - <input type="text" id="nb_caracteres_sans_espaces" name="nb_caracteres_sans_espaces"> + <input type="number" step="0.01" id="valence" name="valence" required> <label for="nb_artistes">Nombre d'artistes :</label> - <input type="text" id="nb_artistes" name="nb_artistes"> + <input type="number" id="nb_artistes" name="nb_artistes" required> <label for="featuring">Featuring :</label> - <input type="text" id="featuring" name="featuring"> + <input type="number" id="featuring" name="featuring" required> <label for="duree_minute">Durée (minutes) :</label> - <input type="text" id="duree_minute" name="duree_minute"> + <input type="number" step="0.01" id="duree_minute" name="duree_minute" required> <label for="categorie_annee">Catégorie année :</label> - <input type="text" id="categorie_annee" name="categorie_annee"> + <input type="number" id="categorie_annee" name="categorie_annee" required> <label for="categorie_tempo">Catégorie tempo :</label> - <input type="text" id="categorie_tempo" name="categorie_tempo"> + <input type="number" id="categorie_tempo" name="categorie_tempo" required> <input type="submit" value="Prédire"> </form> <div id="result"></div> <script> - document.querySelector('form').addEventListener('submit', async function(event) { + document.getElementById("predictionForm").addEventListener("submit", async function(event) { event.preventDefault(); - const form = event.target; - const formData = new FormData(form); + const formData = new FormData(this); - const response = await fetch('/predict', { - method: 'POST', - body: formData, + const response = await fetch("/predict", { + method: "POST", + body: formData }); - + const result = await response.json(); - document.getElementById('result').innerText = `Prédiction: ${result.predictions}`; + document.getElementById("result").innerText = `Prédiction: ${result.predictions}`; }); </script> </body>