Skip to content
Snippets Groups Projects
Commit 68016fda authored by Antaaa28's avatar Antaaa28
Browse files

maj

parent 0e23fbdb
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,22 @@ output: html_document
---
```{r input_parameters, include=FALSE}
# Path to the design file.
# NOTE(review): this is an absolute, machine-specific Windows path; the report
# will only knit on a machine where this exact file exists.
design_file <- "C:/Users/User/Desktop/projet_visualisation/design_WS3.csv"
# Names of the variables to treat as categorical.
categorical_vars <- c("sample", "condition", "animal", "experiment", "extraction")
# Names of the variables to treat as quantitative.
quantitative_vars <- c( "volume", "quantity")
# Names of the variables to show in the "matrix of plots" figure (advanced pairs plot).
# 'sample' is left out here so the visualization is not overloaded.
display_vars <- c("condition", "animal", "experiment", "extraction")
```
```{r setup, include=FALSE}
# Global options: hide code, warnings and messages in the final report.
......@@ -27,6 +43,10 @@ if(!require(ComplexHeatmap)) {
}
if(!require(RColorBrewer)) install.packages("RColorBrewer")
if(!require(reshape2)) install.packages("reshape2")
if(!require(DT)) install.packages("DT")
if(!require(plotly)) install.packages("plotly")
if(!require(lsr)) install.packages("lsr")
if(!require(knitr)) install.packages("knitr")
library(GGally)
library(readr)
......@@ -36,6 +56,13 @@ library(corrplot)
library(ComplexHeatmap)
library(RColorBrewer)
library(reshape2)
library(DT)
library(plotly)
library(lsr)
library(knitr)
```
......@@ -78,31 +105,23 @@ cat("Variables catégorielles détectées : ", paste(colnames(cat_data), collaps
```
2.1 Automated Conversion for Correlation Analysis
Note: For correlation analysis, only the quantitative variables are used.
```{r}
data_numeric <- annotations %>%
mutate(
condition = as.numeric(factor(condition)), # Conversion des catégories en indices numériques
animal = as.numeric(factor(animal)),
experiment = as.numeric(as.Date(experiment, format = "%d.%m.%Y")), # Conversion en nombre
extraction = as.numeric(as.Date(extraction, format = "%d.%m.%Y")),
sample = as.numeric(factor(sample)) # Conversion de 'sample' en indice numérique
) %>%
select(-sample) # Exclure 'sample' du calcul de corrélation
str(data_numeric)
condition = as.numeric(factor(condition)), # Conversion de 'condition'
animal = as.numeric(factor(animal)) # Conversion de 'animal'
# 'sample' sera exclue du calcul
) %>%
select(-sample, -experiment, -extraction)
```
3. Visualizations of Correlations and Distributions
3.1 Correlation Matrix with corrplot
Figure: Correlation matrix showing pairwise Pearson correlations among quantitative variables.
```{r}
if(ncol(data_numeric) > 1){
# Calcul de la matrice de corrélation (méthode Pearson)
cor_matrix <- cor(data_numeric, use = "complete.obs", method = "pearson")
# Génération de la figure de corrélation
corrplot(cor_matrix, method = "circle", type = "lower",
tl.col = "black", tl.cex = 0.8,
col = colorRampPalette(c("blue", "white", "red"))(200))
......@@ -111,6 +130,7 @@ if(ncol(data_numeric) > 1){
}
```
3.2 Heatmap with ComplexHeatmap
......@@ -143,8 +163,13 @@ if(ncol(num_data) > 1){
3.4 Distribution of Numeric Variables
Figure: Histograms with density curves for each quantitative variable.
```{r}
numeric_vars <- names(data_numeric)
```{r distribution_numeric}
# Sélectionner uniquement les variables quantitatives qui existent dans data_numeric
numeric_vars <- quantitative_vars[quantitative_vars %in% names(data_numeric)]
# Génération d'un histogramme et d'une courbe de densité pour chaque variable quantitative
for (var in numeric_vars) {
p <- ggplot(data_numeric, aes_string(x = var)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
......@@ -156,6 +181,7 @@ for (var in numeric_vars) {
}
```
3.5 Scatter Plots for Highly Correlated Pairs
......@@ -188,63 +214,80 @@ if(ncol(data_numeric) > 1){
```
3.6 Distribution of Categorical Variables
Figure: Frequency distributions (bar charts) for each categorical variable.
## Frequency Distributions for Categorical Variables {.tabset}
### experiment
```{r}
cat_vars <- names(cat_data)
for (var in cat_vars) {
p <- ggplot(annotations, aes_string(x = var)) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(title = paste("Frequency Distribution of", var),
x = var, y = "Count") +
theme_minimal()
print(p)
}
ggplot(annotations, aes(x = experiment)) +
geom_bar(fill = "purple", alpha = 0.7) +
labs(title = "Distribution: experiment", x = "Experiment", y = "Count") +
theme_minimal()
```
### extraction
```{r}
ggplot(annotations, aes(x = extraction)) +
geom_bar(fill = "orange", alpha = 0.7) +
labs(title = "Distribution: extraction", x = "Extraction", y = "Count") +
theme_minimal()
```
3.7 Sample Visualizations
3.7.1 Number of Samples by Condition
```{r sample_barplot, echo=FALSE, fig.height=5, fig.width=7}
# Bar plot: number of samples per condition.
p1 <- annotations %>%
  ggplot(aes(x = condition)) +
  geom_bar(fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Number of Samples by Condition",
    x = "Condition",
    y = "Count"
  ) +
  theme_minimal()
# The plot is stored in p1 and printed explicitly.
print(p1)
```
3.7 Detailed Visualization of the 'sample' Variable
Even though the unique sample identifiers are not necessarily used in every figure, it is important to retain and examine their distribution.
Here we provide a dedicated bar chart for the sample variable.
# Table interactive: Liste des identifiants d'échantillons associés aux variables condition et animal
```{r}
if("sample" %in% names(annotations)){
# Ensure 'sample' is treated as a factor and preserve its order.
annotations$sample <- factor(annotations$sample, levels = unique(annotations$sample))
# Bar chart for 'sample'
p <- ggplot(annotations, aes(x = sample)) +
geom_bar(fill = "steelblue", alpha = 0.7) +
labs(title = "Frequency Distribution of 'sample'",
x = "Sample ID", y = "Count") +
theme_minimal()
print(p)
} else {
cat("The variable 'sample' does not exist in the dataset.\n")
}
datatable(annotations[, c("sample", "condition", "animal")],
options = list(pageLength = 10),
caption = "Table of Sample Identifiers and Associated Groups")
```
3.8 Advanced Visualization: Improved Pairs Plot
Figure: Advanced pairs plot of selected categorical variables (excluding 'sample') to provide a global view of group relationships.
```{r}
# Create a copy of the data for the pairs plot.
# Retain 'sample' in the dataset but remove it from the pairs plot display.
data_factors <- annotations %>% select(any_of(c(display_vars, "sample")))
data_pairs <- data_factors %>% select(-sample)
# Ensure that the variables to be displayed are factors.
for (col in display_vars) {
if(col %in% names(data_pairs)){
data_pairs[[col]] <- as.factor(data_pairs[[col]])
# Créer une copie des données pour conversion en facteurs
data_factors <- annotations
# Conversion des colonnes spécifiques en facteurs si elles existent
cols_to_factor <- c("condition", "animal", "experiment", "extraction")
for (col in cols_to_factor) {
if (col %in% names(data_factors)) {
data_factors[[col]] <- as.factor(data_factors[[col]])
}
}
# Define custom colors for the 'condition' variable.
custom_colors <- c("#1B9E77", "#D95F02", "#7570B3")
# Supprimer la colonne "sample" pour cette visualisation
if("sample" %in% names(data_factors)) {
data_factors <- data_factors %>% select(-sample)
}
# Définition des couleurs personnalisées
custom_colors <- c("#1B9E77", "#D95F02", "#7570B3") # Palette pour 'condition'
advanced_plot <- ggpairs(data_pairs,
ggpairs(data_factors,
mapping = aes(color = condition),
lower = list(
continuous = wrap("points", alpha = 0.7, size = 2),
......@@ -254,72 +297,150 @@ advanced_plot <- ggpairs(data_pairs,
continuous = wrap("densityDiag", alpha = 0.5, fill = custom_colors[2])
),
upper = list(
continuous = wrap("cor", size = 4, color = "black")
continuous = wrap("cor", size = 5, color = "black")
)
) +
scale_color_manual(values = custom_colors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
legend.position = "bottom") +
ggtitle("Advanced Pairs Plot of Selected Variables")
advanced_plot
legend.position = "bottom")
```
4. Frequency Distributions of Categorical Variables (Tabset)
The following section uses R Markdown tabsets to display the frequency histograms of each categorical variable in separate tabs.
This helps in quickly verifying that each level of a variable is represented by a similar number of samples—a key point for balanced downstream analyses.
## Group Balance Check
<!-- To enable tabbed sections, add {.tabset} to the header -->
Frequency Distributions for Categorical Variables {.tabset}
condition
Vérifier la répartition des échantillons par condition
```{r}
ggplot(annotations, aes(x = condition)) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(title = "Frequency Distribution: condition",
x = "Condition", y = "Count") +
theme_minimal()
condition_counts <- annotations %>%
count(condition) %>%
arrange(desc(n))
print(condition_counts)
```
sample
Vérifier la répartition des échantillons par animal
```{r}
ggplot(annotations, aes(x = sample)) +
geom_bar(fill = "steelblue", alpha = 0.7) +
labs(title = "Frequency Distribution: sample",
x = "Sample ID", y = "Count") +
theme_minimal()
animal_counts <- annotations %>%
count(animal) %>%
arrange(desc(n))
print(animal_counts)
```
animal
Bar plot interactif pour la répartition par animal
```{r}
ggplot(annotations, aes(x = animal)) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(title = "Frequency Distribution: animal",
x = "Animal", y = "Count") +
p_animal <- ggplot(animal_counts, aes(x = reorder(animal, -n), y = n)) +
geom_bar(stat = "identity", fill = "darkgreen", alpha = 0.8) +
labs(title = "Number of Samples by Animal",
x = "Animal",
y = "Count") +
theme_minimal()
ggplotly(p_animal)
```
experiment
Sous-échantillonnage pour équilibrer les groupes par condition
```{r}
ggplot(annotations, aes(x = experiment)) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(title = "Frequency Distribution: experiment",
x = "Experiment", y = "Count") +
theme_minimal()
# Subsample every condition down to the size of the smallest one so that all
# conditions are equally represented in balanced_annotations.
# condition_counts must already hold one row per condition with its count in n.
min_count <- min(condition_counts$n)
# slice_sample() replaces the superseded sample_n(); on grouped data it draws
# n rows per group.
# NOTE(review): no seed is set in this chunk, so the selected rows change from
# one render to the next; call set.seed() once in the setup chunk if the
# subsample must be reproducible.
balanced_annotations <- annotations %>%
  group_by(condition) %>%
  slice_sample(n = min_count) %>%
  ungroup()
cat("Subsampled to", min_count, "samples per condition.\n")
```
extraction
```{r}
ggplot(annotations, aes(x = extraction)) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(title = "Frequency Distribution: extraction",
x = "Extraction", y = "Count") +
theme_minimal()
```
## 4. Association Matrix: Matrice des liaisons
```{r association_matrix, echo=FALSE, fig.width=8, fig.height=6}
# --- Définition des fonctions d'association ---
# 1. Correlation ratio (eta) between a numeric variable and a categorical one.
#
# Args:
#   x:     numeric vector.
#   group: grouping vector (typically a factor) aligned with x.
#
# Returns:
#   sqrt(SS_between / (SS_between + SS_within)), computed on the rows where
#   both x and group are non-missing. Returns NA_real_ when no complete row
#   remains or when x has no variation at all (the ratio would be 0/0).
correlationRatio <- function(x, group) {
  df <- data.frame(x, group)
  df <- df[complete.cases(df), ]
  if (nrow(df) == 0) {
    return(NA_real_)
  }
  x <- df$x
  # Drop levels that have no observation (unused levels, or levels emptied by
  # the NA filter above): tapply() returns NA for empty cells, which would
  # otherwise turn both sums of squares into NA.
  group <- droplevels(as.factor(df$group))
  grand_mean <- mean(x)
  # Between-group sum of squares.
  ss_between <- sum(
    tapply(x, group, function(xg) length(xg) * (mean(xg) - grand_mean)^2)
  )
  # Within-group sum of squares.
  ss_within <- sum(
    tapply(x, group, function(xg) sum((xg - mean(xg))^2))
  )
  ss_total <- ss_between + ss_within
  if (ss_total == 0) {
    # Constant x: the association is undefined.
    return(NA_real_)
  }
  sqrt(ss_between / ss_total)
}
# 2. Bias-corrected Cramer's V computed from a two-way contingency table.
#
# Args:
#   tab: two-dimensional contingency table (e.g. the result of table(x, y)).
#
# Returns:
#   The bias-corrected Cramer's V, or NA_real_ when the statistic is undefined:
#   fewer than two non-empty rows or columns, fewer than two observations, or a
#   corrected dimension that collapses to zero (as many levels as observations).
computeCramersV <- function(tab) {
  # Rows/columns without any observation (unused factor levels) have expected
  # counts of zero and would make the chi-square statistic NaN.
  tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
  n <- sum(tab)
  n_row <- nrow(tab)
  n_col <- ncol(tab)
  # With a single row or column chisq.test() silently switches to a
  # goodness-of-fit test, so the result would not be an association measure.
  if (n_row < 2 || n_col < 2 || n < 2) {
    return(NA_real_)
  }
  # Chi-square without continuity correction; the small-expected-count warning
  # is irrelevant here because only the statistic is used, not the p-value.
  chi2 <- unname(suppressWarnings(chisq.test(tab, correct = FALSE)$statistic))
  phi2 <- chi2 / n
  # Bias correction, floored at zero to avoid negative values.
  phi2_corr <- max(0, phi2 - ((n_col - 1) * (n_row - 1)) / (n - 1))
  row_corr <- n_row - ((n_row - 1)^2) / (n - 1)
  col_corr <- n_col - ((n_col - 1)^2) / (n - 1)
  denom <- min(col_corr - 1, row_corr - 1)
  if (denom <= 0) {
    return(NA_real_)
  }
  sqrt(phi2_corr / denom)
}
# 3. Association between two variables, chosen from their types:
#    - numeric / numeric : Pearson correlation on complete observations.
#    - factor  / factor  : Cramer's V (via computeCramersV).
#    - numeric / factor  : correlation ratio (eta), in either argument order.
#    - anything else     : NA.
computeAssociation <- function(x, y) {
  x_is_num <- is.numeric(x)
  y_is_num <- is.numeric(y)
  x_is_cat <- is.factor(x)
  y_is_cat <- is.factor(y)

  if (x_is_num && y_is_num) {
    return(cor(x, y, use = "complete.obs"))
  }
  if (x_is_cat && y_is_cat) {
    return(computeCramersV(table(x, y)))
  }
  if (x_is_num && y_is_cat) {
    return(correlationRatio(x, y))
  }
  if (x_is_cat && y_is_num) {
    # correlationRatio() expects the numeric variable first.
    return(correlationRatio(y, x))
  }
  NA
}
# 4. Build the association matrix for every pair of columns of a data set.
#
# Args:
#   data: data frame whose columns are the variables to compare.
#
# Returns:
#   A square matrix, with the column names of data as row and column names,
#   where cell [i, j] holds computeAssociation(column i, column j).
associationMatrix <- function(data) {
  var_names <- names(data)
  n_vars <- length(var_names)
  result <- matrix(
    NA,
    nrow = n_vars,
    ncol = n_vars,
    dimnames = list(var_names, var_names)
  )
  for (row_idx in seq_len(n_vars)) {
    row_values <- data[[row_idx]]
    for (col_idx in seq_len(n_vars)) {
      result[row_idx, col_idx] <- computeAssociation(row_values, data[[col_idx]])
    }
  }
  result
}
# --- Data preparation for the association matrix ---
# Columns used: the quantitative variables (e.g. "volume", "quantity") followed
# by the categorical variables listed in display_vars, leaving out "sample".
mixed_vars <- c(quantitative_vars, setdiff(display_vars, "sample"))
# Select those columns and make sure every selected display variable is a
# factor, so that it is handled as categorical when associations are computed.
mixed_data <- annotations %>%
  select(all_of(mixed_vars)) %>%
  mutate(across(any_of(display_vars), as.factor))
# Compute the association matrix.
assoc_mat <- associationMatrix(mixed_data)
# Render the matrix as a table.
knitr::kable(assoc_mat, digits = 2, caption = "Association Matrix (Matrice des liaisons)")
```
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment