###############################################################################
# Script Title: Simulate EHR Data
# Purpose: Simulate electronic health record (EHR) data including BMI, height, weight,
#          and demographics, while injecting data quality issues for downstream analysis.
# Author: Lars Fritsche
# Date Created: 2025-03-26
# Date Last Modified: 2025-03-28 # <--- Updated modification date
# Usage: Rscript scripts/simulate_ehr_data.R [options]
# Example:
# Rscript scripts/simulate_ehr_data.R \
#    --output_ehr "./data/raw/ehr_bmi_simulated_data.tsv" \
#    --output_ehr_dict "./data/raw/data_dictionary.txt" \
#    --output_demo "./data/raw/demographics_simulated_data.tsv" \
#    --output_demo_dict "./data/raw/demographics_data_dictionary.txt" \
#    --seed 123 \
#    --n_individuals 1000
#
# Input Data: None (data is simulated within the script)
# Output Data:
#   - EHR dataset: Simulated EHR data with anthropometric measurements.
#   - Data dictionary: Description of variables in the EHR dataset.
#   - Demographics dataset: Simulated patient demographics.
#   - Demographics data dictionary: Description of variables in the demographics dataset.
# Dependencies:
#   - R version: 4.3.0 or higher
#   - Packages: optparse
###############################################################################

#------------------------------------------------------------------------------
# Library Management: Install and load required packages
#------------------------------------------------------------------------------
required_packages <- c("optparse") # For handling command-line arguments

# Install missing packages if necessary
for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        message("Installing package: ", pkg)
        install.packages(pkg, repos = "https://cloud.r-project.org")
    }
    # Load required packages
    library(pkg, character.only = TRUE)
    message("Package loaded: ", pkg)
}

#------------------------------------------------------------------------------
# Parameter Initialization: Set up command-line options using optparse
#------------------------------------------------------------------------------
option_list <- list(
    make_option(c("--output_ehr"),
        type = "character", default = "./data/ehr_bmi_simulated_data.tsv",
        help = "Path to the EHR output TSV file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_ehr_dict"),
        type = "character", default = "./data/data_dictionary.txt",
        help = "Path to the EHR data dictionary file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_demo"),
        type = "character", default = "./data/demographics_simulated_data.tsv",
        help = "Path to the demographics output TSV file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_demo_dict"),
        type = "character", default = "./data/demographics_data_dictionary.txt",
        help = "Path to the demographics data dictionary file [default= %default]", metavar = "character"
    ),
    make_option(c("--seed"),
        type = "numeric", default = 123,
        help = "Random seed for reproducibility [default= %default]", metavar = "numeric"
    ),
    make_option(c("--n_individuals"),
        # Changed type to integer for better validation
        type = "integer", default = 1000,
        help = "Number of unique individuals to simulate [default= %default]", metavar = "integer"
    ),
    make_option(c("-v", "--verbose"),
        action = "store_true", default = TRUE,
        help = "Print extra output [default= %default]"
    ),
    make_option(c("-q", "--quiet"),
        action = "store_false", dest = "verbose",
        help = "Print little output"
    )
)

opt_parser <- OptionParser(
    option_list = option_list,
    usage = "Usage: Rscript %prog [options]"
)
opt <- parse_args(opt_parser)

# --- Input Validation ---
if (!is.integer(opt$n_individuals) || opt$n_individuals <= 0) {
    stop("--n_individuals must be a positive integer.", call. = FALSE)
}
if (!is.numeric(opt$seed)) {
     stop("--seed must be numeric.", call. = FALSE)
}
# --- End Input Validation ---

# Assign options to variables for easier use
output_ehr <- opt$output_ehr
output_ehr_dict <- opt$output_ehr_dict
output_demo <- opt$output_demo
output_demo_dict <- opt$output_demo_dict
seed_val <- opt$seed
n_individuals <- opt$n_individuals
verbose_output <- opt$verbose

#------------------------------------------------------------------------------
# Verbose Messaging Functionality
#------------------------------------------------------------------------------
vmsg <- function(...) {
    # Use message directly, no need for paste
    if (verbose_output) message(...)
}

#------------------------------------------------------------------------------
# Check and Prepare Output Directory
#------------------------------------------------------------------------------
output_dirs <- unique(dirname(c(output_ehr, output_ehr_dict, output_demo, output_demo_dict)))
for (dir in output_dirs) {
    if (!dir.exists(dir)) {
        message("Output directory '", dir, "' does not exist. Creating directory...")
        dir.create(dir, recursive = TRUE)
    }
}

#------------------------------------------------------------------------------
# Simulation Parameters Setup
#------------------------------------------------------------------------------
# Set seed for reproducibility
set.seed(seed_val)
vmsg("Random seed set to: ", seed_val)

# --- General Simulation Parameters ---
min_records_per_person <- 50  # Minimum measurements per individual
max_records_per_person <- 150 # Maximum measurements per individual
max_years_span         <- 10  # Maximum span of years for measurements
ehr_start_date         <- as.Date("2010-01-01")
ehr_end_date           <- as.Date("2019-12-31")

# --- ID Generation Parameters ---
person_id_length    <- 12 # Length of hashed person_id strings
encounter_id_length <- 12 # Length of hashed encounter_id strings
id_charset          <- c(0:9, LETTERS, letters) # Characters for IDs

# --- Anthropometric Simulation Parameters ---
height_mean           <- 170 # cm
height_sd             <- 10  # cm
height_min_plausible  <- 130 # cm (used in generation)
height_max_plausible  <- 210 # cm (used in generation)
bmi_mean              <- 26  # kg/m^2
bmi_sd                <- 6   # kg/m^2
bmi_min_plausible     <- 15  # kg/m^2 (used in generation)
bmi_max_plausible     <- 50  # kg/m^2 (used in generation)
weight_change_mean    <- 0.2 # kg/year trend
weight_change_sd      <- 0.5 # kg/year trend SD
weight_noise_sd       <- 2.0 # kg short-term fluctuation SD
height_measurement_error_prob <- 0.1 # Probability of +/- 1cm error

# --- Data Quality Injection Parameters ---
# Missingness
prop_height_missing      <- 0.10
prop_weight_missing      <- 0.05
prop_bmi_missing_added <- 0.03 # Additional BMI missing (where H/W present)
# Implausible Outliers
prop_height_outliers     <- 0.01
prop_weight_outliers     <- 0.01
prop_bmi_outliers        <- 0.01
height_outlier_low_max   <- 99    # cm, values below this are outliers
height_outlier_high_min  <- 251   # cm, values above this are outliers
weight_outlier_low_max   <- 24.9  # kg, values below this are outliers
weight_outlier_high_min  <- 300.1 # kg, values above this are outliers
bmi_outlier_low_max      <- 9.9   # kg/m^2, values below this are outliers
bmi_outlier_high_min     <- 70.1  # kg/m^2, values above this are outliers
# BMI Mismatches
prop_bmi_mismatch        <- 0.05
bmi_mismatch_sd_error    <- 2.0   # SD of error added to create mismatch
# Extreme/Nonsensical Outliers
n_extreme_outliers       <- 5     # Fixed number of specific extreme errors

# --- Demographics Simulation Parameters ---
dob_min_date            <- as.Date("1930-01-01")
dob_max_date            <- as.Date("2000-12-31")
demographics_ref_date   <- as.Date("2019-12-31") # Date for age calculation
prop_age_missing        <- 0.05
prob_deceased           <- c(Yes = 0.1, No = 0.9)
race_categories         <- c("White", "Black", "Asian", "Native American", "Pacific Islander", "Other", "Patient Refused", "Unknown", "")
race_probs              <- c(0.4, 0.15, 0.1, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05)
ethnicity_categories    <- c("Hispanic", "Non-Hispanic", "Patient Refused", "Unknown", "")
ethnicity_probs         <- c(0.2, 0.7, 0.05, 0.03, 0.02)
na_race_eth_values      <- c("Patient Refused", "Unknown", "") # Values to map to NA
sex_gender_categories   <- c("Male", "Female")
marital_status_categories <- c("Single", "Married", "Divorced", "Widowed")
marital_status_probs    <- c(0.4, 0.4, 0.15, 0.05)
zip3_min                <- 0
zip3_max                <- 999


#------------------------------------------------------------------------------
# Main Code Logic: Simulate EHR and Demographics Data
#------------------------------------------------------------------------------
message("Starting simulation of EHR and demographics data...")

# -----------------------------
# Generate Unique Person IDs
# -----------------------------
vmsg("Generating unique person IDs...")
person_ids <- replicate(n_individuals, paste(sample(id_charset, person_id_length, replace = TRUE), collapse = ""))
# Ensure uniqueness by regenerating any duplicates.
# Note: Collisions are extremely unlikely with these ID parameters but check is kept for robustness.
while (length(unique(person_ids)) < n_individuals) {
    vmsg("...regenerating duplicate person IDs (rare event)...")
    dup_idx <- duplicated(person_ids) | duplicated(person_ids, fromLast = TRUE)
    person_ids[dup_idx] <- replicate(sum(dup_idx), paste(sample(id_charset, person_id_length, replace = TRUE), collapse = ""))
}

# -----------------------------
# Simulate Measurements for Each Person
# -----------------------------
vmsg("Simulating measurements for each individual...")

# Functions to generate plausible base height and BMI values
generate_height <- function() {
    # Note: Could add max iterations for safety, but unlikely necessary here.
    h <- rnorm(1, mean = height_mean, sd = height_sd)
    while (h < height_min_plausible || h > height_max_plausible) {
        h <- rnorm(1, mean = height_mean, sd = height_sd)
    }
    return(round(h)) # Round to nearest cm
}

generate_bmi <- function() {
    # Note: Could add max iterations for safety, but unlikely necessary here.
    b <- rnorm(1, mean = bmi_mean, sd = bmi_sd)
    while (b < bmi_min_plausible || b > bmi_max_plausible) {
        b <- rnorm(1, mean = bmi_mean, sd = bmi_sd)
    }
    return(b)
}

# Initialize list to collect records for each individual
records_list <- vector("list", n_individuals)

for (i in 1:n_individuals) {
    if (i %% 100 == 0) vmsg("...simulating individual ", i, " of ", n_individuals) # Progress indicator

    # Determine a random number of records for the individual
    n_rec <- sample(min_records_per_person:max_records_per_person, 1)
    pid <- person_ids[i]

    # Generate stable attributes for the individual
    height_cm_base <- generate_height()
    bmi_base <- generate_bmi()
    weight_kg_base <- bmi_base * (height_cm_base / 100)^2 # Derive weight from BMI and height
    yearly_weight_change <- rnorm(1, mean = weight_change_mean, sd = weight_change_sd) # Annual weight change (kg/year)

    # Determine the measurement date range
    start_day_offset <- sample(0:(365 * (max_years_span - 1)), 1)
    person_start_date <- ehr_start_date + start_day_offset
    span_days <- sample(0:(365 * max_years_span), 1)
    span_days <- max(span_days, n_rec - 1) # Ensure span is sufficient for n_rec distinct days if needed
    person_end_date <- person_start_date + span_days
    if (person_end_date > ehr_end_date) {
        person_end_date <- ehr_end_date
    }
    # Ensure start date is not after end date if span is short or hits boundary
    if(person_start_date > person_end_date) person_start_date <- person_end_date

    # Sample measurement dates (with replacement to allow multiple per day initially)
    if(person_start_date == person_end_date) {
         all_dates <- person_start_date
    } else {
         all_dates <- seq.Date(person_start_date, person_end_date, by = "day")
    }
    measurement_dates <- sample(all_dates, size = n_rec, replace = TRUE)

    # Initialize data frame for the individual's records
    person_df <- data.frame(
        person_id        = pid,
        measurement_date = measurement_dates,
        height_cm        = numeric(n_rec),
        weight_kg        = numeric(n_rec),
        bmi              = numeric(n_rec)
    )

    # Order records chronologically to simulate realistic trends, then revert to original order
    ord <- order(person_df$measurement_date)
    sorted_dates <- person_df$measurement_date[ord]

    for (j in seq_along(sorted_dates)) {
        date_j <- sorted_dates[j]
        # Calculate elapsed years since the start date for trend calculation
        years_since_start <- as.numeric(difftime(date_j, person_start_date, units = "days")) / 365.25

        # Simulate height with occasional +/- 1 cm measurement error
        if (runif(1) < height_measurement_error_prob) {
            height_val <- height_cm_base + sample(c(-1, 1), 1)
        } else {
            height_val <- height_cm_base
        }
        # Ensure height doesn't go below a reasonable minimum due to error
        if(height_val < height_min_plausible - 5) height_val <- height_min_plausible - 5

        # Simulate weight as baseline plus trend and short-term fluctuation
        expected_weight <- weight_kg_base + yearly_weight_change * years_since_start
        weight_val <- expected_weight + rnorm(1, mean = 0, sd = weight_noise_sd)
        if (weight_val < 0) weight_val <- 0 # Ensure weight is non-negative

        # Calculate BMI from the current height and weight
        # Use max(1, height_val) to avoid division by zero or negative heights
        bmi_val <- ifelse(height_val > 0, weight_val / ((height_val / 100)^2), NA)

        # Assign computed values back to the record (in the original sampled date order)
        person_df$height_cm[ord[j]] <- height_val
        person_df$weight_kg[ord[j]] <- weight_val
        person_df$bmi[ord[j]] <- bmi_val
    }

    # Round values to appropriate precision before storing
    person_df$height_cm <- round(person_df$height_cm)
    person_df$weight_kg <- round(person_df$weight_kg, 1)
    person_df$bmi <- round(person_df$bmi, 1)

    records_list[[i]] <- person_df
}

# Combine all individuals' records into one data frame
# Note: For very large n_individuals, consider data.table::rbindlist or dplyr::bind_rows
ehr_data <- do.call(rbind, records_list)
total_records <- nrow(ehr_data)
vmsg("Total EHR records simulated: ", total_records)

# -----------------------------
# Inject Data Quality Issues
# -----------------------------
vmsg("Injecting data quality issues into EHR data...")
n <- total_records # Use shorter alias

# 1. Introduce missing values
vmsg("...injecting missing values...")
height_missing_idx <- sample(1:n, size = floor(prop_height_missing * n))
weight_missing_idx <- sample(1:n, size = floor(prop_weight_missing * n))
ehr_data$height_cm[height_missing_idx] <- NA
ehr_data$weight_kg[weight_missing_idx] <- NA

# Add BMI missing where weight or height is missing, plus some extra
bmi_missing_from_hw <- which(is.na(ehr_data$height_cm) | is.na(ehr_data$weight_kg))
potential_bmi_missing_added <- setdiff(1:n, bmi_missing_from_hw)
bmi_missing_added_idx <- sample(potential_bmi_missing_added, size = floor(prop_bmi_missing_added * n))
ehr_data$bmi[union(bmi_missing_from_hw, bmi_missing_added_idx)] <- NA


# 2. Inject implausible anthropometric values (outliers)
vmsg("...injecting outlier values...")
# Height outliers
valid_h_idx <- which(!is.na(ehr_data$height_cm))
height_outlier_idx <- sample(valid_h_idx, size = floor(prop_height_outliers * n))
h_n <- length(height_outlier_idx)
if (h_n > 0) {
    low_h_idx <- sample(height_outlier_idx, size = floor(h_n / 2))
    high_h_idx <- setdiff(height_outlier_idx, low_h_idx)
    ehr_data$height_cm[low_h_idx] <- sample(50:height_outlier_low_max, length(low_h_idx), replace = TRUE)
    ehr_data$height_cm[high_h_idx] <- sample(height_outlier_high_min:300, length(high_h_idx), replace = TRUE)
}

# Weight outliers
valid_w_idx <- which(!is.na(ehr_data$weight_kg))
# Ensure we don't pick indices already used for height outliers if possible
potential_w_outlier_idx <- setdiff(valid_w_idx, height_outlier_idx)
weight_outlier_idx <- sample(potential_w_outlier_idx, size = floor(prop_weight_outliers * n))
w_n <- length(weight_outlier_idx)
if (w_n > 0) {
    low_w_idx <- sample(weight_outlier_idx, size = floor(w_n / 2))
    high_w_idx <- setdiff(weight_outlier_idx, low_w_idx)
    ehr_data$weight_kg[low_w_idx] <- round(runif(length(low_w_idx), 10, weight_outlier_low_max), 1)
    ehr_data$weight_kg[high_w_idx] <- round(runif(length(high_w_idx), weight_outlier_high_min, 500), 1)
}

# BMI outliers
valid_b_idx <- which(!is.na(ehr_data$bmi))
# Ensure we don't pick indices already used for height/weight outliers if possible
potential_b_outlier_idx <- setdiff(valid_b_idx, c(height_outlier_idx, weight_outlier_idx))
bmi_outlier_idx <- sample(potential_b_outlier_idx, size = floor(prop_bmi_outliers * n))
b_n <- length(bmi_outlier_idx)
if (b_n > 0) {
    low_b_idx <- sample(bmi_outlier_idx, size = floor(b_n / 2))
    high_b_idx <- setdiff(bmi_outlier_idx, low_b_idx)
    ehr_data$bmi[low_b_idx] <- round(runif(length(low_b_idx), 5, bmi_outlier_low_max), 1)
    ehr_data$bmi[high_b_idx] <- round(runif(length(high_b_idx), bmi_outlier_high_min, 100), 1)
}

# 3. Introduce BMI discrepancies (where BMI != calculated BMI)
vmsg("...injecting BMI calculation mismatches...")
# Identify rows with valid H, W, B that haven't been made outliers yet
all_outlier_or_missing_idx <- unique(c(
    which(is.na(ehr_data$height_cm)), which(is.na(ehr_data$weight_kg)), which(is.na(ehr_data$bmi)),
    height_outlier_idx, weight_outlier_idx, bmi_outlier_idx
))
clean_idx <- setdiff(1:n, all_outlier_or_missing_idx)
mismatch_idx <- sample(clean_idx, size = floor(prop_bmi_mismatch * n))

for (idx in mismatch_idx) {
    # Double check validity just in case (should be valid based on clean_idx)
    if (!is.na(ehr_data$height_cm[idx]) && ehr_data$height_cm[idx] > 0 &&
        !is.na(ehr_data$weight_kg[idx]) && ehr_data$weight_kg[idx] >= 0 &&
        !is.na(ehr_data$bmi[idx]))
    {
        orig_bmi <- ehr_data$bmi[idx]
        # Add a small error to the recorded BMI to create a mismatch
        err <- rnorm(1, mean = 0, sd = bmi_mismatch_sd_error)
        new_bmi <- orig_bmi + err
        # Keep the mismatched BMI within a broadly plausible range
        if (new_bmi < bmi_min_plausible - 5) new_bmi <- bmi_min_plausible - 5
        if (new_bmi > bmi_max_plausible + 20) new_bmi <- bmi_max_plausible + 20
        ehr_data$bmi[idx] <- round(new_bmi, 1)
    }
}

# 4. Inject rare extreme or nonsensical outliers
vmsg("...injecting extreme/nonsensical outliers...")
# Sample from indices not already heavily modified
potential_extreme_idx <- setdiff(clean_idx, mismatch_idx)
if(length(potential_extreme_idx) >= n_extreme_outliers && n_extreme_outliers > 0) {
    extreme_idx <- sample(potential_extreme_idx, size = n_extreme_outliers)
    # Assign specific nonsensical values to showcase different error types
    # Ensure indices are assigned sequentially if n_extreme_outliers < 5
    if(n_extreme_outliers >= 1) ehr_data$height_cm[extreme_idx[1]] <- -5    # Negative height
    if(n_extreme_outliers >= 2) ehr_data$height_cm[extreme_idx[2]] <- 999   # Very large height
    if(n_extreme_outliers >= 3) ehr_data$weight_kg[extreme_idx[3]] <- -10   # Negative weight
    if(n_extreme_outliers >= 4) ehr_data$weight_kg[extreme_idx[4]] <- 800   # Very large weight
    if(n_extreme_outliers >= 5) ehr_data$bmi[extreme_idx[5]]       <- 150   # Very large BMI
}

# Ensure rounding consistency after all injections
ehr_data$height_cm <- round(ehr_data$height_cm)
ehr_data$weight_kg <- round(ehr_data$weight_kg, 1)
ehr_data$bmi <- round(ehr_data$bmi, 1)

# -----------------------------
# Assign Encounter IDs
# -----------------------------
vmsg("Assigning encounter IDs to each record...")
encounter_ids <- replicate(total_records, paste(sample(id_charset, encounter_id_length, replace = TRUE), collapse = ""))
# Check for unlikely duplicates
while (length(unique(encounter_ids)) < total_records) {
    vmsg("...regenerating duplicate encounter IDs (rare event)...")
    dup_idx <- duplicated(encounter_ids) | duplicated(encounter_ids, fromLast = TRUE)
    encounter_ids[dup_idx] <- replicate(sum(dup_idx), paste(sample(id_charset, encounter_id_length, replace = TRUE), collapse = ""))
}
ehr_data$encounter_id <- encounter_ids

# -----------------------------
# Convert Measurement Dates to Date-Time Format
# -----------------------------
vmsg("Converting measurement dates to date-time format...")
# Convert Date objects to POSIXct at the start of the day
ehr_data$measurement_date <- as.POSIXct(as.Date(ehr_data$measurement_date))
# Add a random time offset (0 to 86399 seconds) within each day
random_seconds <- runif(total_records, 0, 86400 - 1)
ehr_data$measurement_date <- ehr_data$measurement_date + random_seconds
# Format as 'YYYY-MM-DD HH:MM:SS'
ehr_data$measurement_date <- format(ehr_data$measurement_date, "%Y-%m-%d %H:%M:%S")

# -----------------------------
# Reorder Columns for Final EHR Output
# -----------------------------
ehr_data <- ehr_data[, c("person_id", "encounter_id", "bmi", "height_cm", "weight_kg", "measurement_date")]

# -----------------------------
# Output: Write EHR Dataset and Data Dictionary
# -----------------------------
vmsg("Writing EHR dataset and data dictionary to files...")

# Write the simulated EHR dataset to a TSV file
tryCatch(
    {
        write.table(ehr_data, file = output_ehr, sep = "\t", quote = FALSE, row.names = FALSE, na = "")
        vmsg("EHR dataset successfully saved to: ", output_ehr)
    },
    error = function(e) {
        stop("Error writing EHR dataset to '", output_ehr, "': ", e$message, call. = FALSE)
    }
)

# --- Generate and Write EHR Data Dictionary ---
ehr_colnames <- colnames(ehr_data)
dict_descriptions_ehr <- c(
    # Descriptions MUST be in the same order as columns in ehr_data
    "Hashed unique identifier for an individual (consistent across records).",
    "Hashed unique identifier for the encounter (unique per record).",
    "Body Mass Index (kg/m^2); may be missing, implausible, inconsistent, or nonsensical.",
    "Height in centimeters; may be missing, implausible (<100 or >250), or nonsensical.",
    "Weight in kilograms; may be missing, implausible (<25 or >300), or nonsensical.",
    "Date and time of measurement (YYYY-MM-DD HH:MM:SS)."
)

if (length(ehr_colnames) == length(dict_descriptions_ehr)) {
     dict_lines_ehr <- paste0(ehr_colnames, ": ", dict_descriptions_ehr)
} else {
     warning("Mismatch between EHR column names and dictionary descriptions. Writing basic names only.")
     dict_lines_ehr <- paste0(ehr_colnames, ": [Description Mismatch - Check Script]")
}

tryCatch(
    {
        writeLines(dict_lines_ehr, con = output_ehr_dict)
        vmsg("EHR data dictionary successfully saved to: ", output_ehr_dict)
    },
    error = function(e) {
        stop("Error writing EHR data dictionary to '", output_ehr_dict, "': ", e$message, call. = FALSE)
    }
)
# --- End EHR Dictionary ---

# -----------------------------
# Generate Demographics Data
# -----------------------------
vmsg("Generating demographics data...")
n_demo <- n_individuals # Number of demographic records = number of unique individuals

# Simulate Date of Birth
date_of_birth <- sample(seq.Date(dob_min_date, dob_max_date, by = "day"), n_demo, replace = TRUE)

# Compute age as of the reference date
age <- floor(as.numeric(difftime(demographics_ref_date, date_of_birth, units = "days")) / 365.25)
# Randomly set some ages to NA
na_age_idx <- sample(1:n_demo, size = floor(prop_age_missing * n_demo))
age[na_age_idx] <- NA

# Define age bins based on computed age
age_bin <- ifelse(is.na(age), NA_character_,
    ifelse(age < 18, "<18",
        ifelse(age < 35, "18-34",
            ifelse(age < 55, "35-54",
                ifelse(age < 75, "55-74", "75+")
            )
        )
    )
)

# Simulate deceased status
deceased <- sample(names(prob_deceased), n_demo, replace = TRUE, prob = prob_deceased)

# Simulate race and map specified values to NA
race_raw <- sample(race_categories, n_demo, replace = TRUE, prob = race_probs)
race_clean <- ifelse(race_raw %in% na_race_eth_values, NA_character_, race_raw)

# Simulate ethnicity and map specified values to NA
ethnicity_raw <- sample(ethnicity_categories, n_demo, replace = TRUE, prob = ethnicity_probs)
ethnicity_clean <- ifelse(ethnicity_raw %in% na_race_eth_values, NA_character_, ethnicity_raw)

# Combine race and ethnicity (only if both are non-NA)
race_ethnicity <- ifelse(is.na(race_clean) | is.na(ethnicity_clean),
    NA_character_,
    paste(race_clean, ethnicity_clean)
)

# Harmonize race and ethnicity into broader categories (vectorized approach)
race_ethnicity_harmonized <- ifelse(is.na(race_clean) | is.na(ethnicity_clean), NA_character_,
    ifelse(ethnicity_clean == "Hispanic", "Hispanic",
        ifelse(race_clean == "White", "Non-Hispanic White",
            ifelse(race_clean == "Black", "Non-Hispanic Black",
                ifelse(race_clean %in% c("Asian", "Native American", "Pacific Islander"),
                       "Non-Hispanic Asian/Pacific Islander/Native American",
                       "Other" # Includes 'Other' race category when Non-Hispanic
                )
            )
        )
    )
)

# Simulate sex/gender
sex_gender <- sample(sex_gender_categories, n_demo, replace = TRUE)

# Simulate marital status
marital_status_name <- sample(marital_status_categories, n_demo, replace = TRUE, prob = marital_status_probs)

# Simulate a three-digit ZIP code (with leading zeros)
zip3 <- sprintf("%03d", sample(zip3_min:zip3_max, n_demo, replace = TRUE))

# Combine demographics into a data frame
demo_data <- data.frame(
    person_id = person_ids,
    date_of_birth = format(date_of_birth, "%Y-%m-%d"),
    age = age,
    age_bin = age_bin,
    deceased = deceased,
    race_clean = race_clean,
    ethnicity_clean = ethnicity_clean,
    race_ethnicity = race_ethnicity,
    race_ethnicity_harmonized = race_ethnicity_harmonized,
    sex_gender = sex_gender,
    marital_status_name = marital_status_name,
    zip3 = zip3,
    stringsAsFactors = FALSE # Good practice
)

# -----------------------------
# Output: Write Demographics Dataset and Data Dictionary
# -----------------------------
# Write the demographics dataset to a TSV file
tryCatch(
    {
        write.table(demo_data, file = output_demo, sep = "\t", quote = FALSE, row.names = FALSE, na = "")
        vmsg("Demographics dataset successfully saved to: ", output_demo)
    },
    error = function(e) {
        stop("Error writing demographics dataset to '", output_demo, "': ", e$message, call. = FALSE)
    }
)

# --- Generate and Write Demographics Data Dictionary ---
demo_colnames <- colnames(demo_data)
dict_descriptions_demo <- c(
    # Descriptions MUST be in the same order as columns in demo_data
    "Unique identifier for the individual (matches EHR data).",
    "Date of birth in YYYY-MM-DD format.",
    paste("Age in years as of", format(demographics_ref_date), ". May contain NA."),
    "Age category based on calculated age (<18, 18-34, 35-54, 55-74, 75+).",
    "Indicator if the person is recorded as deceased (Yes/No).",
    paste("Self-reported race (NA for:", paste(na_race_eth_values, collapse=", "), ")."),
    paste("Self-reported ethnicity (NA for:", paste(na_race_eth_values, collapse=", "), ")."),
    "Combined race and ethnicity string (NA if either component is NA).",
    "Harmonized race/ethnicity classification (e.g., Hispanic, Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Asian/Pacific Islander/Native American, Other).",
    "Recorded sex/gender of the individual.",
    "Self-reported marital status.",
    "Simulated three-digit ZIP code prefix."
)

if (length(demo_colnames) == length(dict_descriptions_demo)) {
     dict_lines_demo <- paste0(demo_colnames, ": ", dict_descriptions_demo)
} else {
     warning("Mismatch between Demographics column names and dictionary descriptions. Writing basic names only.")
     dict_lines_demo <- paste0(demo_colnames, ": [Description Mismatch - Check Script]")
}

tryCatch(
    {
        writeLines(dict_lines_demo, con = output_demo_dict)
        vmsg("Demographics data dictionary successfully saved to: ", output_demo_dict)
    },
    error = function(e) {
        stop("Error writing demographics data dictionary to '", output_demo_dict, "': ", e$message, call. = FALSE)
    }
)
# --- End Demographics Dictionary ---


#------------------------------------------------------------------------------
# Session Information: Record the R environment for reproducibility
#------------------------------------------------------------------------------
message("-------------------------------------------------------------------------------")
message("Session Information:")
sessionInfo()
message("-------------------------------------------------------------------------------")

# End of script message
message("Script execution completed successfully.")