ben-domingue / irw

Code related to data for the Item Response Warehouse
https://datapages.github.io/irw/
7 stars 10 forks source link

Downstream integration #618

Open ben-domingue opened 4 days ago

ben-domingue commented 4 days ago

Functionality to support basic integration with standard psychometric tools

ben-domingue commented 3 days ago

parallel analysis c/o yiqing.

perform_parallel_analysis <- function(dir_path, output_file, ni = 25) {

  library(psych)
  library(dplyr)
  library(tidyr)
  library(writexl)

  # List all .Rdata files in the directory with their full paths
  rdata_files <- list.files(path = dir_path, pattern = "\\.Rdata$", full.names = TRUE)

  # Initialize a results data frame
  results <- data.frame(dataset = character(), dimension = integer(), stringsAsFactors = FALSE)

  # Loop through each .Rdata file
  for (data_path in rdata_files) {
    # Initialize a result placeholder with NA for dimension
    result <- data.frame(dataset = basename(data_path), dimension = NA, stringsAsFactors = FALSE)

    tryCatch({
      load(data_path)
      # Assuming the dataset is named 'df'

      # Select a random sample of IDs if the dataset is large
      id <- unique(df$id)
      if (length(id) > 10000) id <- sample(id, 10000)
      df <- df[df$id %in% id,]

      # Spread data, summarise by mean, and pivot wider while dropping the ID column
      df <- df %>%
        group_by(id, item) %>%
        summarise(resp = mean(resp, na.rm = TRUE), .groups = "drop") %>%
        pivot_wider(names_from = item, values_from = resp) %>%
        select(-id)

      # Calculate the sparsity of the entire dataset
      sparsity <- sum(is.na(df)) / prod(dim(df))

      # Check the overall sparsity of the dataset
      if (sparsity > 0.5) {
        message(sprintf("Dataset '%s' has a sparsity level of %f, which is out of the specified range. It will be skipped.", basename(data_path), sparsity))
        result$dimension <- 'sparsity > 0.5'
        result$skipped <- TRUE
      } else {
        # Handle missing values by replacing them with column means
        df <- data.frame(lapply(df, function(col) {
          if (is.numeric(col)) {
            col[is.na(col)] <- mean(col, na.rm = TRUE)
          }
          return(col)
        }))

        # Remove columns with zero variance
        df <- df[sapply(df, var, na.rm = TRUE) > 0]

        # Run Parallel Analysis to determine the number of factors
        pa <- fa.parallel(df, fa = "both", n.iter = ni, plot = FALSE)
        num_factors_pa <- pa$nfact

        # Update the result with the number of factors
        result$dimension <- num_factors_pa
      }

    }, error = function(e) {
      message("Error in dataset ", basename(data_path), ": ", conditionMessage(e))
      result$skipped <- TRUE
    })

    # Append result to results data frame
    results <- rbind(results, result)

    # Write the current results to the Excel file after each iteration
    write_xlsx(results, path = output_file)
  }

  return(results)
}

# Example usage:
dir_path <- "~"  # Update this path to your data folder
output_file <- "~/parallel_analysis_results.xlsx"  # Update this path to the desired output file

perform_parallel_analysis(dir_path, output_file, ni = 25) # You can choose the times of iterations you want