diff --git a/Module_correlation.Rmd b/Module_correlation.Rmd index 448926d4e18fefdc657d8c3725d8f163cea0e5cd..281bc0f66a0f7f8a42994014c9dcc98699869708 100644 --- a/Module_correlation.Rmd +++ b/Module_correlation.Rmd @@ -3,9 +3,18 @@ title: '"Module d''analyse des corrélations des annotations des échantillons"' output: html_document --- + +```{r setup, include=FALSE} + +# Global options: hide code, warnings and messages in the final report. +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) + +``` + + + ```{r load_packages, include=FALSE} # Installer et charger les packages nécessaires - if(!require(GGally)) install.packages("GGally") if(!require(readr)) install.packages("readr") if(!require(dplyr)) install.packages("dplyr") @@ -28,24 +37,31 @@ library(ComplexHeatmap) library(RColorBrewer) library(reshape2) + ``` + +1. Data Import and Exploration ```{r} -# Importer le fichier d'annotations (ex: CSV) -setwd("C:/Users/User/Desktop/projet_visualisation") -annotations <- read.csv("design_test.csv", sep=",", stringsAsFactors = FALSE) +# Import the design file (CSV) +annotations <- read.csv(design_file, sep = ",", stringsAsFactors = FALSE) + +# Convert specified categorical variables to factors. +annotations <- annotations %>% + mutate(across(all_of(categorical_vars), as.factor)) -# Aperçu de la structure et résumé des données +# Overview of data structure and summary. glimpse(annotations) summary(annotations) -# Vérification des valeurs manquantes par colonne +# Check missing values per column. missing_values <- colSums(is.na(annotations)) print(missing_values) + ``` -2. Séparation et Conversion des Variables +2. Variable Separation and Conversion ```{r} @@ -61,94 +77,80 @@ cat("Variables catégorielles détectées : ", paste(colnames(cat_data), collaps ``` -2.1 Conversion Automatisée pour le Calcul de Corrélation - +2.1 Automated Conversion for Correlation Analysis +Note: For correlation analysis, only the quantitative variables are used. ```{r} -convert_to_numeric <- function(df, date_cols = NULL, drop_cols = NULL, date_format = "%d.%m.%Y") { - # Supprime les colonnes non désirées - if (!is.null(drop_cols)) { - df <- df %>% select(-all_of(drop_cols)) - } - - df_numeric <- df %>% mutate(across(everything(), ~ { - if (is.numeric(.)) { - return(.) - } else if (is.character(.)) { - # Si la colonne est spécifiée comme date ou peut être parsée en date - if (!is.null(date_cols) && cur_column() %in% date_cols) { - return(as.numeric(as.Date(., format = date_format))) - } else { - parsed_date <- suppressWarnings(as.Date(., format = date_format)) - if (all(!is.na(parsed_date))) { - return(as.numeric(parsed_date)) - } else { - return(as.numeric(factor(.))) - } - } - } else if (is.factor(.)) { - return(as.numeric(.)) - } else { - return(as.numeric(.)) - } - })) - - return(df_numeric) -} +data_numeric <- annotations %>% + mutate( + condition = as.numeric(factor(condition)), # Conversion des catégories en indices numériques + animal = as.numeric(factor(animal)), + experiment = as.numeric(as.Date(experiment, format = "%d.%m.%Y")), # Conversion en nombre + extraction = as.numeric(as.Date(extraction, format = "%d.%m.%Y")), + sample = as.numeric(factor(sample)) # Conversion de 'sample' en indice numérique + ) %>% + select(-sample) # Exclure 'sample' du calcul de corrélation -# Exemple : traiter "experiment" et "extraction" comme dates, et supprimer "sample" -data_numeric <- convert_to_numeric(annotations, - date_cols = c("experiment", "extraction"), - drop_cols = c("sample")) str(data_numeric) ``` -3. Visualisations de Corrélation et Distributions -3.1 Matrice de Corrélation avec corrplot +3. Visualizations of Correlations and Distributions +3.1 Correlation Matrix with corrplot +Figure: Correlation matrix showing pairwise Pearson correlations among quantitative variables. ```{r} if(ncol(data_numeric) > 1){ + # Calcul de la matrice de corrélation (méthode Pearson) cor_matrix <- cor(data_numeric, use = "complete.obs", method = "pearson") + + # Génération de la figure de corrélation corrplot(cor_matrix, method = "circle", type = "lower", tl.col = "black", tl.cex = 0.8, col = colorRampPalette(c("blue", "white", "red"))(200)) } else { - cat("Pas assez de colonnes numériques pour calculer une matrice de corrélation.\n") + cat("Not enough numeric columns to compute a correlation matrix.\n") } + ``` -3.2 Heatmap avec ComplexHeatmap +3.2 Heatmap with ComplexHeatmap +Figure: Heatmap of the correlation matrix. ```{r} if(ncol(data_numeric) > 1){ - Heatmap(cor_matrix, name = "Corrélation", + Heatmap(cor_matrix, name = "Correlation", col = colorRampPalette(brewer.pal(8, "RdYlBu"))(50), - column_title = "Heatmap de la matrice de corrélation") + column_title = "Heatmap of Correlation Matrix") } else { - cat("Pas assez de variables numériques pour générer une heatmap.\n") + cat("Not enough numeric variables to generate a heatmap.\n") } + ``` -3.3 Pairs Plot des Variables Numériques +3.3 Pairs Plot of Numeric Variables +Figure: Pairs plot (scatterplot matrix) of quantitative variables. + + ```{r} if(ncol(num_data) > 1){ - ggpairs(num_data, title = "Pairs Plot des variables numériques", progress = FALSE) + ggpairs(num_data, title = "Pairs Plot of Numeric Variables", progress = FALSE) } else { - cat("Pas assez de variables numériques pour générer un pairs plot.\n") + cat("Not enough numeric variables to generate a pairs plot.\n") } + ``` -3.4 Distribution des Variables Numériques -Pour chaque variable numérique, un histogramme et une courbe de densité sont affichés. +3.4 Distribution of Numeric Variables +Figure: Histograms with density curves for each quantitative variable. ```{r} numeric_vars <- names(data_numeric) for (var in numeric_vars) { p <- ggplot(data_numeric, aes_string(x = var)) + geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) + geom_density(color = "red", size = 1) + - labs(title = paste("Distribution de", var), - x = var, y = "Densité") + + labs(title = paste("Distribution of", var), + x = var, y = "Density") + theme_minimal() print(p) } @@ -156,8 +158,9 @@ for (var in numeric_vars) { ``` -3.5 Scatter Plots pour Paires de Variables Fortement Corrélées -On identifie les paires de variables dont la corrélation absolue dépasse un seuil (ici 0.7) et on affiche un scatter plot avec une droite de régression. +3.5 Scatter Plots for Highly Correlated Pairs +Figure: Scatter plots with linear regression lines for pairs of variables with |correlation| > 0.7. + ```{r} cor_threshold <- 0.7 if(ncol(data_numeric) > 1){ @@ -171,89 +174,152 @@ if(ncol(data_numeric) > 1){ p <- ggplot(data_numeric, aes_string(x = var1, y = var2)) + geom_point(alpha = 0.6) + geom_smooth(method = "lm", se = FALSE, color = "red") + - labs(title = paste("Scatter Plot de", var1, "vs", var2), - subtitle = paste("Corrélation =", round(cor_matrix[high_cor_pairs[i,1], high_cor_pairs[i,2]], 2))) + + labs(title = paste("Scatter Plot:", var1, "vs", var2), + subtitle = paste("Correlation =", round(cor_matrix[high_cor_pairs[i,1], high_cor_pairs[i,2]], 2))) + theme_minimal() print(p) } } else { - cat("Aucune paire de variables avec une corrélation absolue supérieure à", cor_threshold, "\n") + cat("No variable pairs with an absolute correlation above", cor_threshold, "\n") } } else { - cat("Pas assez de variables numériques pour générer des scatter plots.\n") + cat("Not enough numeric variables to generate scatter plots.\n") } - ``` -3.6 Distribution des Variables Catégorielles + +3.6 Distribution of Categorical Variables +Figure: Frequency distributions (bar charts) for each categorical variable. + ```{r} cat_vars <- names(cat_data) for (var in cat_vars) { p <- ggplot(annotations, aes_string(x = var)) + geom_bar(fill = "blue", alpha = 0.7) + - labs(title = paste("Distribution de la variable catégorielle :", var), - x = var, y = "Fréquence") + + labs(title = paste("Frequency Distribution of", var), + x = var, y = "Count") + theme_minimal() print(p) } ``` -3.7 Visualisation de la Variable sample +3.7 Detailed Visualization of the 'sample' Variable +Even though the unique sample identifiers are not necessarily used in every figure, it is important to retain and examine their distribution. +Here we provide a dedicated bar chart for the sample variable. + ```{r} if("sample" %in% names(annotations)){ - p <- ggplot(annotations, aes(x = factor(sample))) + + # Ensure 'sample' is treated as a factor and preserve its order. + annotations$sample <- factor(annotations$sample, levels = unique(annotations$sample)) + + # Bar chart for 'sample' + p <- ggplot(annotations, aes(x = sample)) + geom_bar(fill = "steelblue", alpha = 0.7) + - labs(title = "Distribution de la variable 'sample'", - x = "Sample", y = "Fréquence") + + labs(title = "Frequency Distribution of 'sample'", + x = "Sample ID", y = "Count") + theme_minimal() print(p) } else { - cat("La variable 'sample' n'existe pas dans les données.\n") + cat("The variable 'sample' does not exist in the dataset.\n") } - ``` - -3.8 Visualisation Avancée (Matrice Améliorée) +3.8 Advanced Visualization: Improved Pairs Plot +Figure: Advanced pairs plot of selected categorical variables (excluding 'sample') to provide a global view of group relationships. ```{r} -# Créer une copie des données pour conversion en facteurs -data_factors <- annotations - -# Conversion des colonnes spécifiques en facteurs si elles existent -cols_to_factor <- c("condition", "animal", "experiment", "extraction") -for (col in cols_to_factor) { - if (col %in% names(data_factors)) { - data_factors[[col]] <- as.factor(data_factors[[col]]) +# Create a copy of the data for the pairs plot. +# Retain 'sample' in the dataset but remove it from the pairs plot display. +data_factors <- annotations %>% select(any_of(c(display_vars, "sample"))) +data_pairs <- data_factors %>% select(-sample) + +# Ensure that the variables to be displayed are factors. +for (col in display_vars) { + if(col %in% names(data_pairs)){ + data_pairs[[col]] <- as.factor(data_pairs[[col]]) } } -# Supprimer la colonne "sample" pour cette visualisation -if("sample" %in% names(data_factors)) { - data_factors <- data_factors %>% select(-sample) -} - -# Définition des couleurs personnalisées -custom_colors <- c("#1B9E77", "#D95F02", "#7570B3") # Vert, Orange, Bleu +# Define custom colors for the 'condition' variable. +custom_colors <- c("#1B9E77", "#D95F02", "#7570B3") -# Génération de la matrice de plots avec ggpairs -ggpairs(data_factors, +advanced_plot <- ggpairs(data_pairs, mapping = aes(color = condition), lower = list( - continuous = wrap("points", alpha = 0.7, color = custom_colors[1]), - combo = wrap("box_no_facet", outlier.colour = "red") + continuous = wrap("points", alpha = 0.7, size = 2), + combo = wrap("box_no_facet", outlier.colour = "red", size = 0.5) ), diag = list( continuous = wrap("densityDiag", alpha = 0.5, fill = custom_colors[2]) ), upper = list( - continuous = wrap("cor", size = 5, color = "black") + continuous = wrap("cor", size = 4, color = "black") ) ) + + scale_color_manual(values = custom_colors) + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), axis.text.y = element_text(size = 10), - legend.position = "bottom") + legend.position = "bottom") + + ggtitle("Advanced Pairs Plot of Selected Variables") +advanced_plot + + +``` + +4. Frequency Distributions of Categorical Variables (Tabset) +The following section uses R Markdown tabsets to display the frequency histograms of each categorical variable in separate tabs. +This helps in quickly verifying that each level of a variable is represented by a similar number of samples—a key point for balanced downstream analyses. + +<!-- To enable tabbed sections, add {.tabset} to the header --> +Frequency Distributions for Categorical Variables {.tabset} +condition + +```{r} +ggplot(annotations, aes(x = condition)) + + geom_bar(fill = "blue", alpha = 0.7) + + labs(title = "Frequency Distribution: condition", + x = "Condition", y = "Count") + + theme_minimal() + +``` + +sample +```{r} +ggplot(annotations, aes(x = sample)) + + geom_bar(fill = "steelblue", alpha = 0.7) + + labs(title = "Frequency Distribution: sample", + x = "Sample ID", y = "Count") + + theme_minimal() + +``` +animal +```{r} +ggplot(annotations, aes(x = animal)) + + geom_bar(fill = "blue", alpha = 0.7) + + labs(title = "Frequency Distribution: animal", + x = "Animal", y = "Count") + + theme_minimal() + +``` + +experiment +```{r} +ggplot(annotations, aes(x = experiment)) + + geom_bar(fill = "blue", alpha = 0.7) + + labs(title = "Frequency Distribution: experiment", + x = "Experiment", y = "Count") + + theme_minimal() + +``` +extraction +```{r} +ggplot(annotations, aes(x = extraction)) + + geom_bar(fill = "blue", alpha = 0.7) + + labs(title = "Frequency Distribution: extraction", + x = "Extraction", y = "Count") + + theme_minimal() + ```