Data Exploration NES (1/2) - 2018
Joffrey JOUMAA
August 22, 2023
Source:vignettes/data_exploration_2018.Rmd
data_exploration_2018.Rmd
This document aims at exploring the dataset of 4 individuals in 2018.
For that purpose, we need first to load the ontodive
package to load data.
Let’s have a look at what’s inside
data_nes$data_2018
:
# list structure
str(data_nes$year_2018, max.level = 1, give.attr = F, no.list = T)
## $ ind_2018070:Classes 'data.table' and 'data.frame': 22393 obs. of 29 variables:
## $ ind_2018072:Classes 'data.table' and 'data.frame': 29921 obs. of 29 variables:
## $ ind_2018074:Classes 'data.table' and 'data.frame': 38608 obs. of 29 variables:
## $ ind_2018080:Classes 'data.table' and 'data.frame': 19028 obs. of 29 variables:
A list of 4
data.frames
, one for each seal
For convenience, we aggregate all 4 individuals into one dataset.
# combine all individuals
data_2018 <- rbindlist(data_nes$year_2018)
# display
DT::datatable(data_2018[sample.int(.N, 10), ], options = list(scrollX = T))
Summary
# raw_data
data_2018[, .(
nb_days_recorded = uniqueN(as.Date(date)),
nb_dives = .N,
maxdepth_mean = mean(maxdepth),
dduration_mean = mean(dduration),
botttime_mean = mean(botttime),
pdi_mean = mean(pdi, na.rm = T)
), by = .id] %>%
sable(
caption = "Summary diving information relative to each 2018 individual",
digits = 2
)
.id | nb_days_recorded | nb_dives | maxdepth_mean | dduration_mean | botttime_mean | pdi_mean |
---|---|---|---|---|---|---|
ind_2018070 | 232 | 22393 | 305.52 | 783.27 | 243.22 | 109.55 |
ind_2018072 | 341 | 29921 | 357.86 | 876.96 | 278.02 | 104.90 |
ind_2018074 | 372 | 38608 | 250.67 | 686.25 | 291.89 | 302.77 |
ind_2018080 | 215 | 19028 | 296.50 | 867.69 | 339.90 | 103.51 |
Very nice dataset :)
Some explanatory plots
Missing values
# build dataset to check for missing values
dataPlot <- melt(data_2018[, .(.id, is.na(.SD)), .SDcol = -c(
".id",
"divenumber",
"divetype",
"date",
"phase",
"lat",
"lon"
)])
# add the id of rows
dataPlot[, id_row := c(1:.N), by = c("variable", ".id")]
# plot
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
geom_tile() +
labs(x = "Attributes", y = "Rows") +
scale_fill_manual(
values = c("white", "black"),
labels = c("Real", "Missing")
) +
facet_wrap(.id ~ ., scales = "free_y") +
theme_jjo() +
theme(
legend.position = "top",
axis.text.x = element_text(angle = 45, hjust = 1),
legend.key = element_rect(colour = "black")
)
So far so good, only few variables seems to have missing values:
# table with percent
table_inter <- data_2018[, lapply(.SD, function(x) {
round(length(x[is.na(x)]) * 100 / length(x), 1)
}), .SDcol = -c(
".id",
"divenumber",
"divetype",
"date",
"phase",
"lat",
"lon"
)]
# find which are different from 0
cond_inter <- sapply(table_inter, function(x) {
x == 0
})
# display the percentages that are over 0
table_inter[, which(cond_inter) := NULL] %>%
sable(caption = "Percentage of missing values per columns having missing values!") %>%
scroll_box(width = "100%")
driftrate | lightatsurf | dist_dep | ssh | psu | vel | temp | bathy |
---|---|---|---|---|---|---|---|
0.5 | 26.3 | 35.1 | 35.1 | 35.1 | 35.1 | 35.1 | 35.1 |
Outliers
Ok, let’s have a look at all the data. But first, we have to remove outliers. Some of them are quiet easy to spot looking at the distribution of dive duration:
ggplot(
rbind(
copy(data_2018)[, state := "Before"],
copy(data_2018)[dduration < 3000, ][, state := "After"]
) %>%
.[, state := factor(state, levels = c("Before", "After"))],
aes(x = dduration, fill = .id)
) +
geom_histogram(show.legend = FALSE) +
geom_vline(xintercept = 3000, linetype = "longdash") +
facet_grid2(state ~ .id,
scales = "free",
independent = "x"
) +
labs(y = "# of dives", x = "Dive duration (s)") +
theme_jjo()
It seems much better, so let’s remove any rows with
dduration
> 3000 sec.
# filter data
data_2018_filter <- data_2018[dduration < 3000, ]
# nbrow removed
data_2018[dduration >= 3000, .(nb_row_removed = .N), by = .id] %>%
sable(caption = "# of rows removed by 2018-individuals")
.id | nb_row_removed |
---|---|
ind_2018070 | 3 |
ind_2018072 | 1 |
ind_2018074 | 33 |
Check day and night
# let's first average `lightatsurf` by individuals, day since departure and hour
dataPlot <- data_2018[, .(lightatsurf = median(lightatsurf)),
by = .(.id, day_departure, date = as.Date(date), hour = hour(date))
]
# display the result
ggplot(dataPlot, aes(x = day_departure, y = hour, fill = lightatsurf)) +
geom_tile() +
facet_grid(.id ~ .) +
theme_jjo() +
labs(
x = "# of days since departure",
y = "Hour",
fill = "Light level at the surface"
) +
theme(legend.position = c("bottom"))
# let's first average `lightatsurf` by individuals, day since departure and hour
dataPlot <- data_2018[, .(lightatsurf = median(lightatsurf)),
by = .(.id,
day_departure,
date = as.Date(date),
hour = hour(date),
phase
)
]
# display the result
ggplot(dataPlot, aes(x = day_departure, y = hour, fill = phase)) +
geom_tile() +
facet_grid(.id ~ .) +
theme_jjo() +
labs(
x = "# of days since departure",
y = "Hour",
fill = "Day time and night time as detected by the `cal_phase_day` function"
) +
theme(legend.position = c("bottom"))
All Variables
names_display <- names(data_2018_filter[, -c(
".id",
"date",
"divenumber",
"divetype",
"day_departure",
"phase",
"lat",
"lon",
"dist_dep",
"sp"
)])
# calulate the median of driftrate for each day
median_driftrate <- data_2018[divetype == "2: drift",
.(driftrate = quantile(driftrate, 0.5)),
by = .(date = as.Date(date), .id)
]
# let's identity when the smooth changes sign
changes_driftrate <- median_driftrate %>%
.[, .(
y_smooth = predict(loess(driftrate ~ as.numeric(date), span = 0.25)),
date
), by = .id] %>%
.[c(FALSE, diff(sign(y_smooth)) != 0), ]
for (i in names_display) {
cat("####", i, "{.unlisted .unnumbered} \n")
if (i == "driftrate") {
print(
ggplot(
data = melt(data_2018_filter[, .(.id, date, get(i), divetype)],
id.vars = c(".id", "date", "divetype")
),
aes(
x = as.Date(date),
y = value,
col = divetype
)
) +
geom_point(
alpha = 1 / 10,
size = .5
) +
geom_vline(
data = changes_driftrate,
aes(xintercept = date),
linetype = 2
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = "Drift Rate 'm/s", col = "Dive Type") +
theme_jjo() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
) +
guides(colour = guide_legend(override.aes = list(
size = 7,
alpha = 1
)))
)
} else {
print(
ggplot(
data = melt(data_2018_filter[, .(.id, date, get(i))],
id.vars = c(".id", "date")
),
aes(
x = as.Date(date),
y = value,
col = .id
)
) +
geom_point(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
geom_vline(
data = changes_driftrate,
aes(xintercept = date),
linetype = 2
) +
geom_vline(data = dataVline, aes(xintercept = as.Date(date)), colour = "black", linetype = 2) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = i) +
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
cat("\n \n")
}
The vertical dashed lines represent changes in buoyancy (see
vignette("buoyancy_detect")
for more information)
Few questions, that I should look into it:
- is the bimodal distribution of
dduration
,desctime
due to nycthemeral migration?- is the bimodal distribution of
descrate
(especially forind2018070
andind_2018072
) due to drift dive?- is
lightatbott
could be used to identify bioluminescence, cause it seems there is a lot going on at the bottom?- are the variations observed for
lightatsurf
is due to moon cycle?- not sure why is there a bimodal distribution of
tempatbott
!driftrate
that one is awesome! Thanks todivetype
we can clearly see a pattern of how driftrate (and so buoyancy) change according time.
# same plot with a colored for the phase of the day
for (i in names_display) {
cat("####", i, "{-} \n")
print(
ggplot(
data = melt(data_2018_filter[, .(.id, date, get(i), phase)],
id.vars = c(
".id",
"date",
"phase"
)
),
aes(
x = as.Date(date),
y = value,
col = phase
)
) +
geom_point(
alpha = 1 / 10,
size = .5
) +
geom_vline(
data = changes_driftrate,
aes(xintercept = date),
linetype = 2
) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x = "Date", y = i) +
theme_jjo() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
) +
guides(colour = guide_legend(override.aes = list(
size = 7,
alpha = 1
)))
)
cat("\n \n")
}
The vertical dashed lines represent changes in buoyancy (see
vignette("buoyancy_detect")
for more information)
All Variables during the first month
for (i in names_display) {
cat("####", i, "{.unlisted .unnumbered} \n")
print(
ggplot(
data = melt(
data_2018_filter[
day_departure < 32,
.(.id, day_departure, get(i))
],
id.vars = c(".id", "day_departure")
),
aes(
x = day_departure,
y = value,
color = .id,
group = day_departure
)
) +
geom_boxplot(
show.legend = FALSE,
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
labs(x = "# days since departure", y = i) +
theme_jjo()
)
cat("\n \n")
}
for (i in names_display) {
cat("####", i, "{.unlisted .unnumbered} \n")
print(
ggplot(
data = melt(
data_2018_filter[
day_departure < 32,
.(.id, day_departure, get(i), phase)
],
id.vars = c(".id", "day_departure", "phase")
),
aes(
x = day_departure,
y = value,
color = phase,
group = interaction(day_departure, phase),
)
) +
geom_boxplot(
alpha = 1 / 10,
size = .5
) +
facet_wrap(. ~ .id, scales = "free") +
labs(x = "# days since departure", y = i) +
theme_jjo() +
theme(legend.position = "bottom")
)
cat("\n \n")
}
Correlation
Can we find nice correlation?
# compute correlation
corr_2018 <- round(cor(data_2018_filter[, names_display, with = F],
use = "pairwise.complete.obs"
), 1)
# replace NA value by 0
corr_2018[is.na(corr_2018)] <- 0
# compute p_values
corr_p_2018 <- cor_pmat(data_2018_filter[, names_display, with = F])
# replace NA value by 0
corr_p_2018[is.na(corr_p_2018)] <- 1
# display
ggcorrplot(
corr_2018,
p.mat = corr_p_2018,
hc.order = TRUE,
method = "circle",
type = "lower",
ggtheme = theme_jjo(),
sig.level = 0.05,
colors = c("#00AFBB", "#E7B800", "#FC4E07")
)
Another way to see it:
# flatten correlation matrix
cor_result_2018 <- flat_cor_mat(corr_2018, corr_p_2018)
# keep only the one above .7
cor_result_2018[cor >= .7, ][order(-abs(cor))] %>%
sable(caption = "Pairwise correlation above 0.75 and associated p-values")
row | column | cor | p |
---|---|---|---|
maxdepth | asctime | 0.8 | 0 |
maxdepth | dduration | 0.7 | 0 |
maxdepth | desctime | 0.7 | 0 |
dduration | desctime | 0.7 | 0 |
dduration | asctime | 0.7 | 0 |
I guess nothing unexpected here ;)
Dive Type
# dataset to plot proportional area plot
dataPlot <- data_2018_filter %>%
.[, .(sum_id_days = .N), by = .(.id, day_departure, divetype)] %>%
.[, prop := sum_id_days / sum(sum_id_days), by = .(.id, day_departure)]
# area plot
ggplot(dataPlot, aes(
x = as.numeric(day_departure),
y = prop,
fill = as.character(divetype)
)) +
geom_area(alpha = 0.6, size = 1, position = "stack", stat = "identity") +
facet_wrap(.id ~ ., scales = "free") +
scale_y_continuous(labels = scales::percent) +
theme_jjo() +
theme(legend.position = "bottom") +
labs(
x = "# of days since departure",
y = "Proportion of dives",
fill = "Dive types"
)
Dive duration vs. Maximum depth
# plot
ggplot(data = data_2018_filter, aes(y = dduration, x = maxdepth, col = .id)) +
geom_point(size = .5, alpha = .1, show.legend = FALSE) +
facet_wrap(.id ~ .) +
labs(x = "Maximum depth (m)", y = "Dive duration (s)") +
theme_jjo()
# plot
ggplot(data = data_2018_filter, aes(
y = dduration,
x = maxdepth,
col = divetype
)) +
geom_point(size = .5, alpha = .1) +
facet_wrap(.id ~ .) +
guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))) +
labs(x = "Maximum depth (m)", y = "Dive duration (s)") +
theme_jjo() +
theme(legend.position = "bottom")
# plot
ggplot(
data = data_2018_filter[, prop_track := (day_departure * 100) / max(day_departure), by = .id],
aes(y = dduration, x = maxdepth, col = prop_track)
) +
geom_point(size = .5, alpha = .1) +
facet_wrap(.id ~ .) +
labs(
x = "Maximum depth (m)",
y = "Dive duration (s)",
col = "Proportion of completed track (%)"
) +
scale_color_continuous(type = "viridis") +
theme_jjo() +
theme(legend.position = "bottom")
# plot
ggplot(data = data_2018_filter, aes(y = dduration, x = maxdepth, col = phase)) +
geom_point(size = .5, alpha = .1) +
facet_wrap(.id ~ .) +
guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))) +
labs(
x = "Maximum depth (m)",
y = "Dive duration (s)",
col = "Phases of the day"
) +
theme_jjo() +
theme(legend.position = "bottom")
There seems to be a patch for high depths (especially visible for
ind2018070
), but I don’t know what it could be linked to…
Drift Rate
In the following graphs:
driftrate
is calculated using onlydivetype == "2: drift"
- whereas all the others variables are calculated all dives considered
# build dataset
dataPlot <- data_2018_filter[divetype == "2: drift",
# median drift rate for drift dive
.(driftrate = median(driftrate, na.rm = T)),
by = .(.id, day_departure)
] %>%
.[
data_2018_filter[,
.(
# median dive duration all dives considered
dduration = median(dduration, na.rm = T),
# median max depth all dives considered
maxdepth = median(maxdepth, na.rm = T),
# median bottom dives all dives considered
botttime = median(botttime, na.rm = T)
),
by = .(.id, day_departure)
],
on = c(".id", "day_departure")
]
# plot
ggplot(dataPlot, aes(x = botttime, y = driftrate, col = .id)) +
geom_point(size = .5, alpha = .5) +
geom_smooth(method = "lm") +
guides(color = "none") +
facet_wrap(.id ~ .) +
scale_x_continuous(limits = c(0, 700)) +
labs(
x = "Daily median Bottom time (s)",
y = "Daily median drift rate (m.s-1)"
) +
theme_jjo()
# plot
ggplot(dataPlot, aes(x = maxdepth, y = driftrate, col = .id)) +
geom_point(size = .5, alpha = .5) +
geom_smooth(method = "lm") +
guides(color = "none") +
facet_wrap(.id ~ .) +
labs(
x = "Daily median Maximum depth (m)",
y = "Daily median drift rate (m.s-1)"
) +
theme_jjo()
# plot
ggplot(dataPlot, aes(x = dduration, y = driftrate, col = .id)) +
geom_point(size = .5, alpha = .5) +
geom_smooth(method = "lm") +
guides(color = "none") +
facet_wrap(.id ~ .) +
labs(
x = "Daily median Dive duration (s)",
y = "Daily median drift rate (m.s-1)"
) +
theme_jjo()
Behavioral Aerobic Dive Limit (bADL)
Cook et al (2008)
# dive duration vs pdi by days
ggplot(data = data_2018_filter[pdi < 300, ], aes(
x = dduration,
y = pdi,
color = .id,
group = dduration,
fill = "none"
)) +
geom_boxplot(show.legend = FALSE, outlier.alpha = 0.05, alpha = 0) +
labs(x = "Dive duration (s)", y = "Post-dive duration (s)") +
facet_wrap(. ~ .id, scales = "free_x") +
theme_jjo()
# dive duration vs pdi by days
ggplot(data = data_2018_filter[pdi < 300, ], aes(
x = dduration,
y = pdi,
color = .id
)) +
geom_point(show.legend = FALSE, alpha = 0.05) +
geom_smooth(
method = "gam",
show.legend = FALSE,
col = "black",
linetype = "dashed"
) +
labs(x = "Dive duration (s)", y = "Post-dive duration (s)") +
facet_wrap(. ~ .id, scales = "free_x") +
theme_jjo()
# dive duration vs pdi by days
ggplot(
data = data_2018_filter[pdi < 300, .(.id, pdi_ratio = pdi / dduration, day_departure)],
aes(
x = day_departure,
y = pdi_ratio,
color = .id,
group = day_departure,
fill = "none"
)
) +
geom_boxplot(
show.legend = FALSE,
outlier.alpha = 0.05,
alpha = 0
) +
labs(x = "# days since departure", y = "Post-dive / Dive duration ratio") +
facet_wrap(. ~ .id, scales = "free_x") +
# zoom
coord_cartesian(ylim = c(0, 0.4)) +
theme_jjo()
Shero et al. (2018)
Based on Shero et al. (2018), we decided to look at the bADL as the 95th percentile of dive duration each day, for those with \(n \geq 50\). This threshold was chosen following this figure:
ggplot(
data_2018_filter[, .(nb_dives = .N),
by = .(.id, day_departure)
],
aes(x = nb_dives, fill = .id)
) +
geom_histogram(show.legend = FALSE) +
geom_vline(xintercept = 50, linetype = "dashed") +
facet_grid(. ~ .id) +
labs(y = "# of days", x = "# of dives per day") +
theme_jjo()
# select day that have at least 50 dives
days_to_keep <- data_2018_filter[,
.(nb_dives = .N),
by = .(.id, day_departure)
] %>%
.[nb_dives >= 50, ]
# keep only those days
data_2018_filter_complete_day <- merge(data_2018_filter,
days_to_keep,
by = c(".id", "day_departure")
)
# data plot
dataPlot <- data_2018_filter_complete_day[divetype == "1: foraging",
.(badl = quantile(dduration, 0.95)),
by = .(.id, day_departure)
]
# combine two datasets to be able to use a second axis
# https://stackoverflow.com/questions/49185583/two-y-axes-with-different-scales-for-two-datasets-in-ggplot2
dataMegaPlot <- rbind(
data_2018_filter_complete_day[divetype == "2: drift"] %>%
.[, .(
w = .id,
y = driftrate,
x = day_departure,
z = "second_plot"
)],
dataPlot[, .(
w = .id,
# tricky one
y = (badl / 1000) - 1,
x = day_departure,
z = "first_plot"
)]
)
# plot
ggplot() +
geom_point(
data = dataMegaPlot[z == "second_plot", ],
aes(x = x, y = y),
alpha = 1 / 10,
size = 0.5,
color = "grey40",
show.legend = FALSE
) +
geom_path(
data = dataMegaPlot[z == "first_plot", ],
aes(x = x, y = y, color = w),
show.legend = FALSE
) +
scale_y_continuous(
# Features of the first axis
name = "Drift rate (m/s)",
# Add a second axis and specify its features
sec.axis = sec_axis(~ (. * 1000) + 1000,
name = "Behavioral Aerobic Dive Limit (s)"
)
) +
labs(x = "# days since departure") +
facet_wrap(w ~ .) +
theme_jjo()
Looking at this graph, I want to believe that there is some kind of relationship between the bADL as defined by Shero et al. (2018) and the drift rate (and so buyoancy).
# get badl
dataplot_1 <- data_2018_filter_complete_day[,
.(badl = quantile(dduration, 0.95)),
by = .(.id, day_departure)
]
# get driftrate
dataplot_2 <- data_2018_filter_complete_day[divetype == "2: drift",
.(driftrate = median(driftrate)),
by = .(.id, day_departure)
]
# merge
dataPlot <- merge(dataplot_1,
dataplot_2,
by = c(".id", "day_departure"),
all = TRUE
)
# plot
ggplot(data = dataPlot, aes(x = badl, y = driftrate, col = .id)) +
geom_point(show.legend = FALSE) +
facet_wrap(.id ~ ., scales = "free") +
theme_jjo()
# ind_2018070
plot_ly(
x = dataPlot[.id == "ind_2018070", badl],
y = dataPlot[.id == "ind_2018070", day_departure],
z = dataPlot[.id == "ind_2018070", driftrate],
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
color = dataPlot[.id == "ind_2018070", day_departure]
) %>%
layout(scene = list(
xaxis = list(title = "Behavioral ADL"),
yaxis = list(title = "# days since departure"),
zaxis = list(title = "Drift rate (m/s)")
))
# ind_2018072
plot_ly(
x = dataPlot[.id == "ind_2018072", badl],
y = dataPlot[.id == "ind_2018072", day_departure],
z = dataPlot[.id == "ind_2018072", driftrate],
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
color = dataPlot[.id == "ind_2018072", day_departure]
) %>%
layout(scene = list(
xaxis = list(title = "Behavioral ADL"),
yaxis = list(title = "# days since departure"),
zaxis = list(title = "Drift rate (m/s)")
))
# ind_2018074
plot_ly(
x = dataPlot[.id == "ind_2018074", badl],
y = dataPlot[.id == "ind_2018074", day_departure],
z = dataPlot[.id == "ind_2018074", driftrate],
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
color = dataPlot[.id == "ind_2018074", day_departure]
) %>%
layout(scene = list(
xaxis = list(title = "Behavioral ADL"),
yaxis = list(title = "# days since departure"),
zaxis = list(title = "Drift rate (m/s)")
))
# ind_2018080
plot_ly(
x = dataPlot[.id == "ind_2018080", badl],
y = dataPlot[.id == "ind_2018080", day_departure],
z = dataPlot[.id == "ind_2018080", driftrate],
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
color = dataPlot[.id == "ind_2018080", day_departure]
) %>%
layout(scene = list(
xaxis = list(title = "Behavioral ADL"),
yaxis = list(title = "# days since departure"),
zaxis = list(title = "Drift rate (m/s)")
))