Supplementary Simulation Code Longmixr Hagenberg
Supplementary Simulation Code Longmixr Hagenberg
Contents
1 Data simulation 1
1.1 Mixed model . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
1.2 IRT for categorical data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12
1.3 Create one data source . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18
1 Data simulation
The data is simulated in two steps:
1. generate a continuous variable from a mixed model
2. use this variable as theta (latent variable or ability) in a graded response model (GRM)
The theta in the GRM should follow a N(0, 1) distribution. Therefore, the values from step 1 are mapped
to the quantiles of a N(0, 1) function. For every questionnaire, one mixed model is used. In the GRM, the
number of the items per questionnaire, the discrimination of each item and the difficulty/ threshold for every
class (the value of theta at which the probability to select the next class is 50%) can be varied.
In total, 50, 100, 200, 500 and 1000 individuals aged between 18 - 65 with 4 time points in 4 equally sized
groups are simulated with 3 questionnaires:
• 15 items with 5 levels each
• 20 items with 4 levels each
• 2 items with 5 levels each and 3 continuous variables
Additionally, a cross-sectional continuous variable is simulated that could be used to compare the groups
after clustering.
1
age_df <- data.frame(ID = 1:1000,
age_visit_1 = round(runif(200, min = 18, max = 65)))
# adapted from
# https://stats.stackexchange.com/questions/394092/simulate-longitudinal-curvilinear-convergent-data-in-
if (!is.null(age_info)) {
data <- dplyr::left_join(data, age_info, by = "ID")
}
# remove again the age because I add it later to the merged data
if (!is.null(age_info)) {
data <- data %>%
select(-age_visit_1)
2
}
data
}
Normalisation function:
quantile_normalisation <- function(data) {
# check for NAs
data <- data.frame(value = data)
index_use <- which(!is.na(data$value) &
!is.nan(data$value))
if (length(index_use) == 0) {
stop("No data provided")
}
1.1.1 2 groups
3
-8.5, 0.9),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit + I(visitˆ2) + I(visitˆ3)) + age_visit_1),
z_formula = formula(~ 1)
))
The third questionnaire consists of both categorical variables and continuous variables. The continuous
variables are directly modeled with the mixed model.
group_2_latent_3 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 2,
# intercept, groupB, visit,
# age_visit_1, groupB:visit
betas = c(0, 6.5, 0.8,
-0.2, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 0.5
))
4
group_2_continuous_3_1 <- group_2_continuous_3_1 %>%
map(function(data) {
data %>%
mutate(y_norm = quantile_normalisation(y))
})
group_2_continuous_3_2 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 2,
# intercept, groupB, visit,
# age_visit_1, groupB:visit
betas = c(3, 0, 0,
-0.05, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
5
Create one continuous cross-sectional variable:
group_2_continuous_cr <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 2,
# intercept, groupB, age_visit_1
betas = c(3, -2, 0.05),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
1.1.2 3 groups
6
0.9, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit + I(visitˆ2) + I(visitˆ3)) + age_visit_1),
z_formula = formula(~ 1)
))
The third questionnaire consists of both categorical variables and continuous variables. The continuous
variables are directly modeled with the mixed model.
group_3_latent_3 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 3,
# intercept, groupB, groupC, visit,
# age_visit_1, groupB:visit, groupC:visit
betas = c(0, 8, -2, 0.8,
-0.2, 0, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 0.5
))
7
group_3_continuous_3_1 <- group_3_continuous_3_1 %>%
map(function(data) {
data %>%
mutate(y_norm = quantile_normalisation(y))
})
group_3_continuous_3_2 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 3,
# intercept, groupB, groupC, visit,
# age_visit_1, groupB:visit, groupC:visit
betas = c(3, 0, 0.1, 0,
-0.05, 0, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
8
Create one continuous cross-sectional variable:
group_3_continuous_cr <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 3,
# intercept, groupB, groupC, age_visit_1
betas = c(3, -2, 2.5, 0.05),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
1.1.3 4 groups
9
0.9, 0, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit + I(visitˆ2) + I(visitˆ3)) + age_visit_1),
z_formula = formula(~ 1)
))
The third questionnaire consists of both categorical variables and continuous variables. The continuous
variables are directly modeled with the mixed model.
group_4_latent_3 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 4,
# intercept, groupB, groupC, groupD, visit,
# age_visit_1, groupB:visit, groupC:visit, groupD:visit
betas = c(0, 8, -2, 0, 0.8,
-0.2, 0, 0, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 0.5
))
10
group_4_continuous_3_1 <- group_4_continuous_3_1 %>%
map(function(data) {
data %>%
mutate(y_norm = quantile_normalisation(y))
})
group_4_continuous_3_2 <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 4,
# intercept, groupB, groupC, groupD, visit,
# age_visit_1, groupB:visit, groupC:visit, groupD:visit
betas = c(3, 0, 0.1, -0.2, 0,
-0.05, 0, 0, 0),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group * (visit) + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
11
Create one continuous cross-sectional variable:
group_4_continuous_cr <- c(50, 100, 200, 500, 1000) %>%
set_names() %>%
map(~simulate_longi(
n = .x,
n_groups = 4,
# intercept, groupB, groupC, groupD, age_visit_1
betas = c(3, -2, 2.5, -1.3, 0.05),
random_var = c(D11 = 0.0001),
x_formula = formula(~ group + age_visit_1),
z_formula = formula(~ 1),
sigma = 1
))
1.2.1 2 groups
1.2.1.1 Questionnaire 1 Create questionnaire 1 with 15 items and 5 levels each:
group_2_irt_1 <- group_2_latent_1 %>%
map(function(data) {
simIrt(
theta = data$y_norm,
params = cbind(
a = c(0.5, 1, 1.5, 1.5, 1, 1, 0.75, 0.5, 2, 0.9, 1.3, 0.8, 1.4, 1.1, 1),
b1 = c(-2, -2, -2, -2.5, -2, -1.5, -1, -2.75, -1.8, -2.3, -2.6, -2.2, -2.2, -1.7, -1.4),
b2 = c(-1, -1, -1, -2, -0.5, 0, 1, -1.5, -1.2, -1.7, -1.4, -1.1, -1.1, -0.6, -0.1),
b3 = c(0, 0, 0, -1, 1, 1.5, 2, 1.5, 0.5, 2, 1.3, 1.4, 1.3, 0.8, 0.9),
b4 = c(1.5, 1.5, 1.5, 1, 2, 2, 3, 2, 1.8, 2.5, 1.9, 2.5, 2.1, 1.6, 1.9)
),
mod = "grm"
12
)
})
13
group_2_irt_3 <- group_2_latent_3 %>%
map(function(data) {
simIrt(
theta = data$y_norm,
params = cbind(
a = c(1, 1.5),
b1 = c(-1.9, -1.3),
b2 = c(-0.4, -0.1),
b3 = c(0.9, 0.5),
b4 = c(1.6, 1.8)
),
mod = "grm"
)
})
1.2.2 3 groups
1.2.2.1 Questionnaire 1 Create questionnaire 1 with 15 items and 5 levels each:
group_3_irt_1 <- group_3_latent_1 %>%
map(function(data) {
simIrt(
theta = data$y_norm,
params = cbind(
a = c(0.5, 1, 1.5, 1.5, 1, 1, 0.75, 0.5, 2, 0.9, 1.3, 0.8, 1.4, 1.1, 1),
b1 = c(-2, -2, -2, -2.5, -2, -1.5, -1, -2.75, -1.8, -2.3, -2.6, -2.2, -2.2, -1.7, -1.4),
b2 = c(-1, -1, -1, -2, -0.5, 0, 1, -1.5, -1.2, -1.7, -1.4, -1.1, -1.1, -0.6, -0.1),
14
b3 = c(0, 0, 0, -1, 1, 1.5, 2, 1.5, 0.5, 2, 1.3, 1.4, 1.3, 0.8, 0.9),
b4 = c(1.5, 1.5, 1.5, 1, 2, 2, 3, 2, 1.8, 2.5, 1.9, 2.5, 2.1, 1.6, 1.9)
),
mod = "grm"
)
})
15
1.2.2.3 Questionnaire 3 Create questionnaire 3 with 2 items and 5 levels each:
group_3_irt_3 <- group_3_latent_3 %>%
map(function(data) {
simIrt(
theta = data$y_norm,
params = cbind(
a = c(1, 1.5),
b1 = c(-1.9, -1.3),
b2 = c(-0.4, -0.1),
b3 = c(0.9, 0.5),
b4 = c(1.6, 1.8)
),
mod = "grm"
)
})
1.2.3 4 groups
1.2.3.1 Questionnaire 1 Create questionnaire 1 with 15 items and 5 levels each:
group_4_irt_1 <- group_4_latent_1 %>%
map(function(data) {
simIrt(
theta = data$y_norm,
params = cbind(
a = c(0.5, 1, 1.5, 1.5, 1, 1, 0.75, 0.5, 2, 0.9, 1.3, 0.8, 1.4, 1.1, 1),
b1 = c(-2, -2, -2, -2.5, -2, -1.5, -1, -2.75, -1.8, -2.3, -2.6, -2.2, -2.2, -1.7, -1.4),
16
b2 = c(-1, -1, -1, -2, -0.5, 0, 1, -1.5, -1.2, -1.7, -1.4, -1.1, -1.1, -0.6, -0.1),
b3 = c(0, 0, 0, -1, 1, 1.5, 2, 1.5, 0.5, 2, 1.3, 1.4, 1.3, 0.8, 0.9),
b4 = c(1.5, 1.5, 1.5, 1, 2, 2, 3, 2, 1.8, 2.5, 1.9, 2.5, 2.1, 1.6, 1.9)
),
mod = "grm"
)
})
17
})
18
bind_cols(
q_1,
q_2 %>% select(starts_with("q2")),
q_3 %>% select(starts_with("q3"))
) %>%
left_join(crosssectional %>%
select(ID, additional_variable = y),
by = "ID") %>%
left_join(age_df, by = "ID")
})
19
2 Run the simulation
The following is the code executed on the command line with the arguments for the different number of
groups and n:
args = commandArgs(trailingOnly=TRUE)
number_groups <- as.character(args[1])
number_obs <- as.character(args[2])
library(dplyr)
library(FactoMineR)
library(factoextra)
library(longmixr)
library(flexmix)
library(lme4)
library(purrr)
# Dimension reduction
# do the dimension reduction by group
# groups:
# - questionnaire_1
# - questionnaire_2
# - questionnaire_3
# Data wrangling
# Bring the data into a format suitable for flexmix
20
# look how much variance the components explain and use the elbow plot method
# to decide on the number of components -> 2 for all settings
21
list_models <- lapply(response_names, function(x) {
FLXMRmgcv(as.formula(paste0(x, " ~ .")))
})
saveRDS(model, paste0("lcc_model_simulated_data_group_",
number_groups, "_n_", number_obs, "_2024_01_29.Rds"))
22