Appendix A — Results: Ratings

Setup and libraries
source("setup_params.R")

library("tidyverse")
library("janitor")
library("stringr")
library("here")
library("knitr")
library("kableExtra")
library("ggrepel")
library("scales")
library("jsonlite")
library("purrr")
library("tibble")
# library("plotly")  # removed: all figures are now static ggplot

# Theme and colors — crisp palette (high saturation, maximum separation)

# Primary accent colours (hex RGB) used directly in plots.
UJ_ORANGE <- "#E8722A"   # vivid saffron orange
UJ_GREEN  <- "#2D9D5E"   # rich emerald green
UJ_BLUE   <- "#2B7CE9"   # clear azure blue

# Named colour map: evaluator/model display name -> hex colour.
# Passed to scale_color_manual()/scale_fill_manual() so each model keeps a
# stable colour across every figure in the appendix.
MODEL_COLORS <- c(
  "GPT-5 Pro"        = "#E8722A",   # saffron orange (focal)
  "GPT-5.2 Pro"      = "#D62839",   # bright crimson
  "GPT-4o-mini"      = "#17B890",   # vivid teal
  "Claude Sonnet 4"  = "#A855F7",   # vivid purple
  "Claude Opus 4.6"  = "#7C3AED",   # deep violet
  "Gemini 2.0 Flash" = "#2B7CE9",   # clear azure
  "Human"            = "#2D9D5E"    # emerald green
)

theme_uj <- function(base_size = 12) {
  # House ggplot2 theme: minimal base with bottom legend, bold titles, and
  # light major gridlines only. `base_size` sets the reference font size
  # that every rel() size below scales from.
  uj_overrides <- theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major   = element_line(linewidth = 0.3, color = "grey88"),
    plot.title          = element_text(face = "bold", size = rel(1.1)),
    plot.title.position = "plot",
    plot.subtitle       = element_text(color = "grey40", size = rel(0.9)),
    plot.caption        = element_text(color = "grey50", size = rel(0.8), hjust = 0),
    axis.title          = element_text(size = rel(0.95)),
    axis.text           = element_text(size = rel(0.88)),
    legend.position     = "bottom",
    legend.text         = element_text(size = rel(0.88)),
    legend.title        = element_text(size = rel(0.9), face = "bold"),
    strip.text          = element_text(face = "bold", size = rel(0.95))
  )
  theme_minimal(base_size = base_size) + uj_overrides
}

canon_metric <- function(x) {
  # Map long-form criterion names (as they appear in the raw data files)
  # onto the short canonical keys used throughout this appendix. Values
  # already canonical — or unknown — pass through unchanged, and NA stays
  # NA. Intended for character vectors.
  lookup <- c(
    advancing_knowledge = "adv_knowledge",
    open_science        = "open_sci",
    logic_communication = "logic_comms",
    global_relevance    = "gp_relevance",
    claims_evidence     = "claims"
  )
  out <- x
  known <- !is.na(x) & x %in% names(lookup)
  out[known] <- unname(lookup[x[known]])
  out
}

`%||%` <- function(x, y) if (!is.null(x)) x else y

# Pricing per 1M tokens (USD)
# Static price table joined onto per-evaluation token counts to estimate
# spend. NOTE(review): prices are hard-coded snapshots — confirm they match
# the vendor price sheets in effect when the runs were made.
pricing <- tribble(
  ~model,              ~input_per_m, ~output_per_m,
  "GPT-5 Pro",         15.00,        60.00,
  "GPT-5.2 Pro",       15.00,        60.00,
  "GPT-4o-mini",       0.15,         0.60,
  "Claude Sonnet 4",   3.00,         15.00,
  "Claude Opus 4.6",   15.00,        75.00,
  "Gemini 2.0 Flash",  0.075,        0.30
)
Load human evaluation data
# Load human ratings from Unjournal
# UJ_map.csv: semicolon-delimited lookup from full research title to the
# short "Author et al. YYYY" paper label used in figures and tables.
UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |>
  mutate(label_paper_title = research, label_paper = paper) |>
  select(c("label_paper_title", "label_paper"))

# Per-evaluator criterion ratings; rename `research` so later joins can use
# the shared `label_paper_title` key.
rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |>
  clean_names() |>
  mutate(label_paper_title = research) |>
  select(-c("research"))

# Paper-level metadata, restricted to papers with published Unjournal
# evaluations. A few DOIs get hand-assigned short labels that are missing
# from (or wrong in) UJ_map.csv.
research <- read_csv("data/research.csv", show_col_types = FALSE) |>
  clean_names() |>
  filter(status == "50_published evaluations (on PubPub, by Unjournal)") |>
  left_join(UJmap, by = c("label_paper_title")) |>
  mutate(doi = str_trim(doi)) |>
  mutate(label_paper = case_when(
    doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023",
    doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025",
    doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022",
    doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023",
    TRUE ~ label_paper
  )) |>
  left_join(rsx, by = c("label_paper_title"))

# One row per title -> short label; slice(1) arbitrarily resolves any
# duplicate labels per title so the join below stays many-to-one.
key_map <- research |>
  transmute(label_paper_title = str_trim(label_paper_title), label_paper = label_paper) |>
  filter(!is.na(label_paper_title)) |>
  distinct(label_paper_title, label_paper) |>
  group_by(label_paper_title) |>
  slice(1) |>
  ungroup()

# Attach the short paper label to every rating row; titles are trimmed on
# both sides so stray whitespace cannot break the join.
rsx_research <- rsx |>
  mutate(label_paper_title = str_trim(label_paper_title)) |>
  left_join(key_map, by = "label_paper_title", relationship = "many-to-one")

# Long table of human ratings: one row per paper x criterion x evaluator,
# with midpoint `mid` and optional CI bounds `lo`/`hi` on the 0-100 scale.
metrics_human <- rsx_research |>
  mutate(criteria = canon_metric(criteria)) |>
  filter(criteria %in% c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |>
  transmute(
    paper = label_paper, criteria, evaluator, model = "Human",
    mid = as.numeric(middle_rating),
    # Non-numeric CI cells coerce to NA; the coercion warnings are expected.
    lo = suppressWarnings(as.numeric(lower_ci)),
    hi = suppressWarnings(as.numeric(upper_ci))
  ) |>
  filter(!is.na(paper), !is.na(mid)) |>
  mutate(
    # Clamp CI bounds into [0, 100]; non-finite values become NA.
    lo = ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_),
    hi = ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_)
  ) |>
  mutate(across(c(mid, lo, hi), ~ round(.x, 4))) |>
  distinct(paper, criteria, model, evaluator, mid, lo, hi)

# Per-paper human consensus on the "overall" criterion: mean midpoint and
# mean CI endpoints across evaluators, plus the evaluator count.
human_avg <- metrics_human |>
  filter(criteria == "overall") |>
  group_by(paper) |>
  summarise(
    human_mid = mean(mid, na.rm = TRUE),
    human_lo = mean(lo, na.rm = TRUE),
    human_hi = mean(hi, na.rm = TRUE),
    n_human = n(),
    .groups = "drop"
  )

# Summary stats for inline use
n_human_papers <- n_distinct(metrics_human$paper)
n_human_evaluators <- n_distinct(metrics_human$evaluator)
Load LLM evaluation data (all models)
# Map from results/<dir>/json directory name to the display name used in
# tables and figures. Directory names encode the model snapshot/run date.
model_dirs <- list(
  "gpt5_pro_updated_jan2026" = "GPT-5 Pro",
  "gpt52_pro_focal_jan2026" = "GPT-5.2 Pro",
  "gpt_4o_mini_2024_07_18" = "GPT-4o-mini",
  "claude_sonnet_4_20250514" = "Claude Sonnet 4",
  "claude_opus_4_6" = "Claude Opus 4.6",
  "gemini_2.0_flash" = "Gemini 2.0 Flash"
)

# Parse one <paper>.response.json file into tidy records.
#
# `path` is a JSON response file written by an evaluation run; `model_name`
# is the display name attached to every output row. Returns NULL on any
# parse failure, otherwise a list of four tibbles:
#   metrics - one row per 0-100 criterion (midpoint + bounds)
#   tiers   - one row per journal-tier metric (score + CI)
#   tokens  - single row of API-reported token usage
#   summary - single row with the model's written assessment
parse_response <- function(path, model_name) {
  tryCatch({
    r <- jsonlite::fromJSON(path, simplifyVector = FALSE)
    # The paper label comes from the file name: strip the response suffix
    # and turn underscores back into spaces.
    paper <- basename(path) |>
      str_replace("\\.response\\.json$", "") |>
      str_replace_all("_", " ")

    # The structured payload can live in one of three places depending on
    # the response format: pre-parsed (`parsed`), raw text (`output_text`),
    # or inside the first "message" item of an `output` array.
    parsed <- NULL
    if (!is.null(r$parsed) && length(r$parsed) > 0) {
      parsed <- r$parsed
    } else if (!is.null(r$output_text) && nchar(r$output_text) > 0) {
      # Strip markdown code fences if present
      txt <- r$output_text
      txt <- sub("^\\s*```[a-z]*\\s*\n?", "", txt)
      txt <- sub("\\s*```\\s*$", "", txt)
      parsed <- jsonlite::fromJSON(txt, simplifyVector = TRUE)
    } else if (!is.null(r$output)) {
      msg <- purrr::detect(r$output, ~ .x$type == "message", .default = NULL)
      if (!is.null(msg) && length(msg$content) > 0) {
        parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE)
      }
    }

    if (is.null(parsed)) return(NULL)

    metrics <- parsed$metrics
    metric_rows <- list()
    tier_rows <- list()

    # Tier metrics carry score/ci_lower/ci_upper fields; every other metric
    # carries midpoint/lower_bound/upper_bound.
    tier_names <- c("tier_should", "tier_will", "journal_should", "journal_will")

    for (nm in names(metrics)) {
      if (nm %in% tier_names) {
        # Normalise journal_should/journal_will → tier_should/tier_will
        tier_kind <- sub("^journal_", "tier_", nm)
        tier_rows[[length(tier_rows) + 1]] <- tibble(
          paper = paper, model = model_name, tier_kind = tier_kind,
          score = metrics[[nm]]$score,
          ci_lower = metrics[[nm]]$ci_lower,
          ci_upper = metrics[[nm]]$ci_upper
        )
      } else {
        metric_rows[[length(metric_rows) + 1]] <- tibble(
          paper = paper, model = model_name, metric = nm,
          midpoint = metrics[[nm]]$midpoint,
          lower_bound = metrics[[nm]]$lower_bound,
          upper_bound = metrics[[nm]]$upper_bound
        )
      }
    }

    # Token counts: prefer the nested `usage` object, fall back to
    # top-level fields. Reasoning tokens appear only for reasoning models.
    input_tok <- r$usage$input_tokens %||% r$input_tokens
    output_tok <- r$usage$output_tokens %||% r$output_tokens
    reasoning_tok <- r$usage$output_tokens_details$reasoning_tokens

    list(
      metrics = bind_rows(metric_rows),
      tiers = bind_rows(tier_rows),
      tokens = tibble(
        paper = paper, model = model_name,
        input_tokens = input_tok %||% NA_integer_,
        output_tokens = output_tok %||% NA_integer_,
        reasoning_tokens = reasoning_tok %||% NA_integer_
      ),
      summary = tibble(
        paper = paper, model = model_name,
        assessment_summary = parsed$assessment_summary
      )
    )
  }, error = function(e) NULL)  # any failure yields NULL; the caller skips the file
}

# Walk every model results directory, parse each *.response.json with
# parse_response(), and accumulate the four record types into flat tibbles.
# Returns a list with elements: metrics, tiers, tokens, summaries.
load_all_llm <- function() {
  all_metrics <- list()
  all_tiers <- list()
  all_tokens <- list()
  all_summaries <- list()

  for (dir_name in names(model_dirs)) {
    model_name <- model_dirs[[dir_name]]
    json_dir <- here("results", dir_name, "json")

    # Skip models whose results directory is absent (e.g. partial runs).
    if (dir.exists(json_dir)) {
      files <- list.files(json_dir, pattern = "\\.response\\.json$", full.names = TRUE)

      for (f in files) {
        result <- parse_response(f, model_name)
        # parse_response() returns NULL on parse failure; drop those files.
        if (!is.null(result)) {
          all_metrics[[length(all_metrics) + 1]] <- result$metrics
          all_tiers[[length(all_tiers) + 1]] <- result$tiers
          all_tokens[[length(all_tokens) + 1]] <- result$tokens
          all_summaries[[length(all_summaries) + 1]] <- result$summary
        }
      }
    }
  }

  metrics <- bind_rows(all_metrics)
  # Guard: with zero parsed files bind_rows() yields a 0-column tibble, and
  # mutate(criteria = canon_metric(metric)) would error on the missing
  # `metric` column. Return a correctly-typed empty tibble in that case.
  if (nrow(metrics) > 0) {
    metrics <- metrics |> mutate(criteria = canon_metric(metric))
  } else {
    metrics <- tibble(
      paper = character(), model = character(), metric = character(),
      midpoint = numeric(), lower_bound = numeric(), upper_bound = numeric(),
      criteria = character()
    )
  }

  list(
    metrics = metrics,
    tiers = bind_rows(all_tiers),
    tokens = bind_rows(all_tokens),
    summaries = bind_rows(all_summaries)
  )
}

# Run the loader once and unpack the four tables.
llm_data <- load_all_llm()
llm_metrics <- llm_data$metrics
llm_tiers <- llm_data$tiers
llm_tokens <- llm_data$tokens
llm_summaries <- llm_data$summaries

# Summary stats for inline use
n_llm_models <- n_distinct(llm_metrics$model)
n_llm_papers <- n_distinct(llm_metrics$paper)
llm_model_names <- unique(llm_metrics$model)
llm_model_list <- paste(llm_model_names, collapse = ", ")

# Papers with both human and LLM data
matched_papers <- intersect(
 unique(metrics_human$paper),
 unique(llm_metrics$paper)
)
n_matched <- length(matched_papers)

# Opus sample: papers evaluated by Claude Opus 4.6 AND humans
# Agreement tables and Krippendorff alpha use this sample so all
# models are compared on the same set of papers.
opus_papers <- intersect(
  llm_metrics |> filter(model == "Claude Opus 4.6") |> pull(paper) |> unique(),
  unique(metrics_human$paper)
)
n_opus <- length(opus_papers)

# Cost summary
# NOTE(review): cost uses input + output tokens only. This assumes
# reasoning tokens are already billed as part of output_tokens — confirm
# against each vendor's billing documentation.
cost_data <- llm_tokens |>
  left_join(pricing, by = "model") |>
  mutate(
    reasoning_tokens = coalesce(reasoning_tokens, 0L),
    cost_usd = (input_tokens * input_per_m + output_tokens * output_per_m) / 1e6
  )

total_cost <- sum(cost_data$cost_usd, na.rm = TRUE)
total_evaluations <- nrow(llm_tokens)

# Per-model summaries
# avg_reasoning averages only the rows with positive reasoning tokens, so
# it is NaN for models that never report them (rendered as "—" downstream).
model_summary <- cost_data |>
  group_by(model) |>
  summarise(
    n_papers = n(),
    avg_input = round(mean(input_tokens, na.rm = TRUE)),
    avg_output = round(mean(output_tokens, na.rm = TRUE)),
    avg_reasoning = round(mean(reasoning_tokens[reasoning_tokens > 0], na.rm = TRUE)),
    total_cost = sum(cost_usd, na.rm = TRUE),
    cost_per_paper = mean(cost_usd, na.rm = TRUE),
    .groups = "drop"
  )

We compare quantitative ratings from 6 frontier LLMs (GPT-5 Pro, GPT-5.2 Pro, GPT-4o-mini, Claude Sonnet 4, Claude Opus 4.6, Gemini 2.0 Flash) against human expert reviews from The Unjournal. Across all models, 45 papers have both human and LLM evaluations. Agreement metrics and Krippendorff’s alpha below are computed on the 33-paper matched sample with Claude Opus 4.6 evaluations, so that all models are compared on identical papers.

We submit each PDF through a single, schema-enforced API call. The system prompt mirrors The Unjournal rubric, blocks extrinsic priors (authors, venue, web context), and requires strict JSON: a ~1k-word diagnostic summary plus numeric midpoints and 90% credible intervals for every metric.1 Each numeric field carries a midpoint (model’s 50% belief) and a 90% credible interval; wide intervals indicate acknowledged uncertainty.

Cost and token usage. Token costs are computed from API-reported input/output tokens (GPT-5 Pro runs with reasoning = high).

Show code
# Table A.1: token usage and estimated cost per model, most expensive first.
if (nrow(llm_tokens) > 0) {

  cost_table <- model_summary |>
    # Sort on the numeric total BEFORE formatting. The previous version
    # sorted after formatting by parsing "$x,xxx.xx" strings back to
    # numbers with gsub("[$,]", ...), which is fragile and roundabout.
    arrange(desc(total_cost)) |>
    transmute(
      Model = model,
      Papers = n_papers,
      `Avg input` = scales::comma(avg_input),
      `Avg output` = scales::comma(avg_output),
      # avg_reasoning is NaN for models that report no reasoning tokens;
      # show an em dash for those.
      `Avg reasoning` = ifelse(is.finite(avg_reasoning) & avg_reasoning > 0,
                                scales::comma(avg_reasoning), "—"),
      `Total cost` = scales::dollar(total_cost, accuracy = 0.01),
      `Cost/paper` = scales::dollar(cost_per_paper, accuracy = 0.001)
    )

  knitr::kable(cost_table, align = c("l", rep("r", 6)))
}
Table A.1: Token usage and estimated cost per model
Model Papers Avg input Avg output Avg reasoning Total cost Cost/paper
Claude Opus 4.6 45 116,930 1,509 — $84.02 $1.867
GPT-5 Pro 60 38,516 6,400 4,717 $57.71 $0.962
GPT-5.2 Pro 14 45,143 2,889 707 $11.91 $0.850
Claude Sonnet 4 16 113,798 727 — $5.64 $0.352
GPT-4o-mini 60 71,581 470 — $0.66 $0.011
Gemini 2.0 Flash 60 18,084 1,024 — $0.10 $0.002

Total cost across all 255 LLM evaluations: $160.03. Reasoning-capable models (GPT-5 Pro, GPT-5.2 Pro, Claude Opus 4.6) cost substantially more per paper than their lighter counterparts (GPT-4o-mini, Gemini 2.0 Flash), raising practical questions about cost-quality trade-offs for deployment at scale.

Agreement metrics. We report Spearman ρ (rank-based, robust to outliers), MAE in percentile points, mean bias (LLM − Human), and Pearson r. Full definitions appear in Methods. Caution on comparability: LLM ρ is computed against the mean of 1.9 human raters, while the Human–Human row compares individual raters to each other. Because averaging reduces noise, the raw LLM ρ is upward-biased relative to ρ_HH; the Spearman-Brown adjusted column corrects for this.

Show code
# Minimum matched-paper count below which agreement estimates get a † flag.
MIN_N <- 15

# NOTE(review): `scatter_data`, `sb_factor`, `spearman_ci95`, and the
# hh_* baseline values are defined in an earlier chunk not shown here —
# confirm they are in scope before this chunk runs.
if (nrow(scatter_data) > 0) {

  # Per-model agreement of LLM midpoints against the human mean overall
  # rating. RMSE is computed but dropped by the final select().
  llm_rows_raw <- scatter_data |>
    group_by(Model = model) |>
    summarise(
      N           = n(),
      raw_rho     = cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
      `Pearson r` = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      `Mean bias` = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      RMSE        = round(sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)), 1),
      MAE         = sprintf("%.1f", mean(abs(midpoint - human_mid), na.rm = TRUE)),
      .groups     = "drop"
    ) |>
    arrange(desc(raw_rho)) |>
    mutate(
      flag           = ifelse(N < MIN_N, "†", ""),
      `ρ (vs. mean)` = sprintf("%.3f%s", raw_rho, flag),
      # Spearman-Brown correction rescales LLM-vs-mean rho to the
      # individual-rater-equivalent scale (fair comparison to rho_HH).
      `ρ adj. (SB)`  = sprintf("%.3f%s", raw_rho * sb_factor, flag),
      `95% CI`       = mapply(spearman_ci95, raw_rho, N)
    ) |>
    select(Model, N, `ρ (vs. mean)`, `ρ adj. (SB)`, `95% CI`, `Pearson r`, `Mean bias`, MAE)

  # Human-human baseline row (individual rater vs. individual rater).
  hh_row <- tibble(
    Model          = "Human–Human",
    N              = hh_n,
    `ρ (vs. mean)` = sprintf("%.3f", hh_spearman),
    `ρ adj. (SB)`  = "—",
    `95% CI`       = spearman_ci95(hh_spearman, hh_n),
    `Pearson r`    = round(hh_pearson, 3),
    `Mean bias`    = hh_bias,
    MAE            = sprintf("%.1f", hh_mae)
  )

  agreement_tbl <- bind_rows(hh_row, llm_rows_raw)

  # Bold/tint the baseline row so it reads as the reference level.
  knitr::kable(agreement_tbl, align = c("l", rep("r", ncol(agreement_tbl) - 1))) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0")
}
Table A.2: Agreement between each LLM and human mean overall rating, matched sample (N = 33). Human–Human row (top, bold) shows pairwise evaluator Spearman ρ (individual vs. individual). ρ (vs. mean): raw Spearman ρ between LLM and human mean rating — upward-biased as a comparison to ρ_HH because the mean suppresses noise. ρ adj. (SB): Spearman-Brown corrected to individual-rater-equivalent scale (×0.87 for k ≈ 1.9 raters/paper); this is the fair comparison to ρ_HH. 95% CI: Fisher-z interval. Bias = LLM − Human (positive = LLM more generous). †N < 15: unreliable estimate.
Model N ρ (vs. mean) ρ adj. (SB) 95% CI Pearson r Mean bias MAE
Human–Human 27 0.472 — [0.11, 0.72] 0.584 +3.5 11.3
GPT-5 Pro 33 0.553 0.480 [0.26, 0.75] 0.396 +5.1 10.0
Claude Opus 4.6 33 0.477 0.414 [0.16, 0.70] 0.567 -7.6 12.4
Gemini 2.0 Flash 33 0.447 0.388 [0.12, 0.68] 0.467 -5.3 10.8
GPT-5.2 Pro 5 0.410† 0.356† [-0.74, 0.95] 0.433 -6.6 15.4
GPT-4o-mini 33 0.309 0.268 [-0.04, 0.59] 0.356 -0.1 10.7
Claude Sonnet 4 8 -0.058† -0.050† [-0.73, 0.67] -0.232 -1.6 6.8

Table A.2 restricts all models to the 33-paper sample where every model (including Claude Opus 4.6) has an evaluation, enabling a fair cross-model comparison on identical papers. The raw ρ (vs. mean) is not directly comparable to ρ_HH: LLMs are evaluated against the mean of 1.9 human raters, which suppresses idiosyncratic noise and inflates apparent agreement relative to an individual-vs-individual comparison. The Spearman-Brown correction (ρ adj.) converts raw LLM ρ to the individual-rater-equivalent scale by multiplying by 0.87 — this is the fair comparator to the Human–Human ρ of 0.472. The 95% CIs (Fisher-z) are wide at this sample size; model rankings should be interpreted cautiously. Models with N < 15 (marked †) are shown for completeness but yield unreliable estimates.

Table A.3 repeats the same analysis using each model’s own maximum matched sample, maximising N per model at the cost of cross-model comparability.

Show code
# Table A.3: same agreement analysis as Table A.2, but on each model's own
# maximum matched sample rather than the common Opus paper set.
if (nrow(llm_metrics) > 0 && nrow(human_avg) > 0) {

  # All (paper, model) overall ratings that have a human mean to compare to.
  scatter_full <- llm_metrics |>
    filter(criteria == "overall") |>
    inner_join(human_avg |> select(paper, human_mid), by = "paper") |>
    filter(!is.na(human_mid), !is.na(midpoint))

  # HH baseline on all matched papers (not restricted to opus_papers)
  # NOTE(review): only the first two evaluators per paper (E1, E2) form the
  # pair; papers with 3+ evaluators contribute just that one pair — confirm
  # this is intended.
  hh_pairs_full <- metrics_human |>
    filter(criteria == "overall", paper %in% unique(scatter_full$paper)) |>
    select(paper, evaluator, mid) |>
    distinct() |>
    group_by(paper) |>
    filter(n() >= 2) |>
    mutate(slot = paste0("E", row_number())) |>
    ungroup() |>
    pivot_wider(names_from = slot, values_from = c(mid, evaluator)) |>
    filter(!is.na(mid_E1), !is.na(mid_E2))

  hh_rho_full <- round(cor(hh_pairs_full$mid_E1, hh_pairs_full$mid_E2,
                             method = "spearman"), 3)

  # Mean number of evaluators per paper (k), feeding the Spearman-Brown
  # correction below: r_kk is the reliability of a k-rater mean, and
  # sb_full rescales LLM-vs-mean rho to the individual-rater scale.
  k_full <- metrics_human |>
    filter(criteria == "overall", paper %in% unique(scatter_full$paper)) |>
    group_by(paper) |>
    summarise(k = n_distinct(evaluator), .groups = "drop") |>
    summarise(k = mean(k)) |>
    pull(k)
  r_kk_full  <- k_full * hh_rho_full / (1 + (k_full - 1) * hh_rho_full)
  sb_full    <- sqrt(hh_rho_full / r_kk_full)

  # Per-model agreement rows; † flags models with < 15 matched papers.
  llm_rows_full <- scatter_full |>
    group_by(Model = model) |>
    summarise(
      N           = n(),
      raw_rho     = cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
      `Pearson r` = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      `Mean bias` = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      MAE         = sprintf("%.1f", mean(abs(midpoint - human_mid), na.rm = TRUE)),
      .groups     = "drop"
    ) |>
    arrange(desc(raw_rho)) |>
    mutate(
      flag           = ifelse(N < 15, "†", ""),
      `ρ (vs. mean)` = sprintf("%.3f%s", raw_rho, flag),
      `ρ adj. (SB)`  = sprintf("%.3f%s", raw_rho * sb_full, flag),
      # NOTE(review): spearman_ci95 is defined in an earlier, unseen chunk.
      `95% CI`       = mapply(spearman_ci95, raw_rho, N)
    ) |>
    select(Model, N, `ρ (vs. mean)`, `ρ adj. (SB)`, `95% CI`, `Pearson r`, `Mean bias`, MAE)

  # Human-human baseline row; bias/MAE are not meaningful for a rater pair.
  hh_row_full <- tibble(
    Model          = "Human–Human",
    N              = nrow(hh_pairs_full),
    `ρ (vs. mean)` = sprintf("%.3f", hh_rho_full),
    `ρ adj. (SB)`  = "—",
    `95% CI`       = spearman_ci95(hh_rho_full, nrow(hh_pairs_full)),
    `Pearson r`    = round(cor(hh_pairs_full$mid_E1, hh_pairs_full$mid_E2), 3),
    `Mean bias`    = "—",
    MAE            = "—"
  )

  tbl_full <- bind_rows(hh_row_full, llm_rows_full)

  knitr::kable(tbl_full, align = c("l", rep("r", ncol(tbl_full) - 1))) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0") |>
    kableExtra::footnote(symbol = "† N < 15 papers; estimates unreliable.")
}
Table A.3: Agreement between each LLM and human mean overall rating, using each model’s own maximum matched sample (not restricted to the common 33-paper set). Larger N per model but cross-model comparison is confounded by paper-set differences. Same columns and Spearman-Brown correction as Table A.2.
Model N ρ (vs. mean) ρ adj. (SB) 95% CI Pearson r Mean bias MAE
Human–Human 37 0.432 — [0.13, 0.66] 0.565 — —
GPT-5 Pro 45 0.517 0.444 [0.26, 0.70] 0.342 +6.4 10.3
Claude Opus 4.6 33 0.477 0.409 [0.16, 0.70] 0.567 -7.6 12.4
GPT-5.2 Pro 9 0.471† 0.404† [-0.28, 0.86] 0.333 -2.7 14.7
Gemini 2.0 Flash 45 0.313 0.269 [0.02, 0.56] 0.342 -5.3 11.7
GPT-4o-mini 45 0.116 0.099 [-0.18, 0.40] 0.135 +0.0 13.1
Claude Sonnet 4 11 0.055† 0.047† [-0.56, 0.63] 0.020 +0.2 6.9
* † N < 15 papers; estimates unreliable.

Human baseline context. Human percentile ratings are a noisy reference signal, not ground truth: even expert evaluators given the same paper differ systematically, reflecting genuine uncertainty in what a “correct” score would be. Table A.4 quantifies this noise as Krippendorff’s αHH. The αHL columns show how close each LLM comes to that level. A model with αHL ≈ αHH on a criterion is generating signal comparable to a second human evaluator on that dimension. Criteria where αHH is already low indicate genuine expert disagreement; low αHL on those criteria is therefore expected, not a model failure.

Show code
# Table A.4: Krippendorff's alpha — inter-human (alpha_HH) vs human-LLM
# (alpha_HL) agreement per criterion, on the common Opus paper sample.
# The whole chunk is gated on the optional `irr` package.
if (requireNamespace("irr", quietly = TRUE)) {

  library(irr)

  criteria_list <- c("overall", "claims", "methods", "adv_knowledge",
                     "logic_comms", "open_sci", "gp_relevance")

  criterion_labels <- c(
    overall = "Overall", claims = "Claims", methods = "Methods",
    adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
    open_sci = "Open Science", gp_relevance = "Global Relevance"
  )

  # Compute human-human alpha (each evaluator as separate rater)
  # Rows of the matrix are evaluators, columns are papers; interval-level
  # alpha tolerates the missing cells where an evaluator skipped a paper.
  compute_hh_alpha <- function(criterion) {
    human_wide <- metrics_human |>
      filter(criteria == criterion, paper %in% opus_papers) |>
      select(paper, evaluator, mid) |>
      distinct() |>
      pivot_wider(names_from = paper, values_from = mid)

    # Need at least 2 raters and 2 papers (first column is `evaluator`).
    if (nrow(human_wide) < 2 || ncol(human_wide) < 3) return(NA_real_)

    M <- as.matrix(human_wide[, -1, drop = FALSE])
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Compute human-LLM alpha for a specific model
  # Treats the human MEAN as one "rater" and the LLM as the other — this is
  # the mean-vs-individual asymmetry flagged in the table caption.
  compute_hl_alpha <- function(criterion, model_name) {
    human_mean <- metrics_human |>
      filter(criteria == criterion, paper %in% opus_papers) |>
      group_by(paper) |>
      summarise(Human = mean(mid, na.rm = TRUE), .groups = "drop")

    llm_rating <- llm_metrics |>
      filter(criteria == criterion, model == model_name, paper %in% opus_papers) |>
      select(paper, midpoint) |>
      rename(LLM = midpoint)

    combined <- inner_join(human_mean, llm_rating, by = "paper")
    if (nrow(combined) < 3) return(NA_real_)

    M <- rbind(Human = combined$Human, LLM = combined$LLM)
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Get unique LLM models and their paper counts (to flag low-N columns)
  llm_models <- unique(llm_metrics$model)
  model_n_df <- llm_metrics |>
    filter(criteria == "overall", paper %in% opus_papers) |>
    group_by(model) |>
    summarise(n = n_distinct(paper), .groups = "drop")
  model_n_map <- setNames(model_n_df$n, model_n_df$model)

  MIN_N_ALPHA <- 15

  # Build comparison table
  # rowwise() because compute_hh_alpha() takes one criterion at a time.
  baseline_results <- tibble(Criterion = criterion_labels[criteria_list]) |>
    mutate(criterion_key = criteria_list) |>
    rowwise() |>
    mutate(
      `α_HH` = compute_hh_alpha(criterion_key)
    ) |>
    ungroup()

  # Add columns for each LLM model, labelled with N and flagged if N < MIN_N_ALPHA
  # The column is first created under a temporary name (:= needs a fixed
  # string), then renamed to include the per-model N label.
  for (mod in llm_models) {
    n_mod        <- if (mod %in% names(model_n_map)) as.integer(model_n_map[mod]) else 0L
    n_label      <- if (n_mod < MIN_N_ALPHA) sprintf("n=%d†", n_mod) else sprintf("n=%d", n_mod)
    col_raw      <- paste0("α_HL (", mod, ")")
    col_labelled <- paste0("α_HL ", mod, " (", n_label, ")")
    baseline_results <- baseline_results |>
      rowwise() |>
      mutate(!!col_raw := compute_hl_alpha(criterion_key, mod)) |>
      ungroup()
    names(baseline_results)[names(baseline_results) == col_raw] <- col_labelled
  }

  # Format for display — put α_HH immediately after Criterion column
  baseline_display <- baseline_results |>
    select(-criterion_key) |>
    mutate(across(where(is.numeric), ~ ifelse(is.na(.x), "\u2014", sprintf("%.2f", .x))))

  # \u03b1 is the Greek alpha used in the column names built above.
  alpha_col_order <- c("Criterion", "\u03b1_HH",
                       sort(grep("^\u03b1_HL", names(baseline_display), value = TRUE)))
  existing_cols <- alpha_col_order[alpha_col_order %in% names(baseline_display)]
  baseline_display <- baseline_display |> select(all_of(existing_cols))

  knitr::kable(baseline_display, align = c("l", rep("r", ncol(baseline_display) - 1))) |>
    kableExtra::column_spec(2, bold = TRUE) |>
    kableExtra::footnote(
      symbol = "\u2020 N < 15 papers; \u03b1_HL unreliable for this model."
    )
}
Table A.4: Krippendorff’s αHH (inter-human agreement, bold) vs αHL (human–LLM agreement) by criterion, matched sample. Column headers show N papers per model; models with N < 15 (marked †) are unreliable. Note: αHL is computed against the human mean while αHH uses individual raters—the same mean-vs-individual asymmetry as in Table A.2. αHH is the reference level; an LLM reaching αHL ≈ αHH provides signal comparable to an additional human rater for that dimension.
Criterion α_HH α_HL Claude Opus 4.6 (n=33) α_HL Claude Sonnet 4 (n=8†) α_HL Gemini 2.0 Flash (n=33) α_HL GPT-4o-mini (n=33) α_HL GPT-5 Pro (n=33) α_HL GPT-5.2 Pro (n=5†)
Overall 0.58 0.47 -0.15 0.37 0.36 0.33 0.45
Claims 0.07 0.26 -0.17 0.42 0.25 0.33 0.49
Methods 0.29 0.39 -0.30 0.27 0.31 0.32 0.21
Adv. Knowledge 0.13 0.32 0.10 0.10 0.22 0.23 0.20
Logic & Comms 0.47 0.09 0.14 -0.04 0.21 -0.12 -0.19
Open Science 0.27 -0.18 0.33 0.08 -0.08 -0.07 -0.52
Global Relevance 0.41 0.18 0.02 0.21 0.05 0.25 0.69
* † N < 15 papers; α_HL unreliable for this model.

Where αHL approaches αHH, the LLM’s ratings align with humans comparably to how a second human evaluator would — bearing in mind that αHL uses the human mean as one “rater” while αHH uses individual raters, so the comparison slightly favours the LLM. Criteria where αHH is already low indicate genuine evaluator disagreement; low αHL on those criteria reflects task difficulty rather than a model failure. Extended agreement analysis is available in the supplementary material.

Criteria-level patterns. Figure A.1 displays the mean rating each evaluator type assigns across the seven criteria. Criteria where all models and humans converge (similar colour intensity) suggest robust agreement on that dimension; criteria with divergent columns indicate systematic differences in how humans and LLMs weigh that aspect of quality.

Show code
# Display labels for the seven rubric criteria (canonical key -> label).
metric_labels <- c(
  overall = "Overall", claims = "Claims", methods = "Methods",
  adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
  open_sci = "Open Science", gp_relevance = "Global Relevance"
)

# Mean rating per evaluator type x criterion over the matched papers:
# humans averaged over `mid`, each LLM averaged over `midpoint`.
criteria_means <- bind_rows(
  metrics_human |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(criteria) |>
    summarise(mean_rating = mean(mid, na.rm = TRUE), .groups = "drop") |>
    mutate(model = "Human"),
  llm_metrics |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(model, criteria) |>
    summarise(mean_rating = mean(midpoint, na.rm = TRUE), .groups = "drop")
) |>
  # Factor with explicit levels so columns render in rubric order.
  mutate(criteria_label = factor(metric_labels[criteria], levels = metric_labels))

# Figure A.1: heatmap of mean ratings — rows are evaluator types, columns
# are criteria; fill colour is clamped to the 40-80 range.
if (nrow(criteria_means) > 0) {
  ggplot(criteria_means, aes(x = criteria_label, y = model, fill = mean_rating)) +
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(label = round(mean_rating, 0)), size = 4.2, color = "white", fontface = "bold") +
    scale_fill_gradient(low = "#f0f0f0", high = UJ_ORANGE, limits = c(40, 80), name = "Mean rating") +
    labs(x = NULL, y = NULL) +
    theme_uj() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 11),
          axis.text.y = element_text(size = 11),
          panel.grid = element_blank())
}
Figure A.1: Mean rating by evaluator type (rows) and criterion (columns), averaged across all matched papers. Tile colour scales from light (lower mean) to vivid orange (higher mean), clamped to 40–80. Criteria where all rows show similar colour intensity indicate broad agreement; divergent columns flag criteria-level systematic differences.

Additional models. Figure A.2 shows overall ratings for the two lightweight models—GPT-4o-mini and Gemini 2.0 Flash—against human mean ratings. These models cost 50–200× less per paper than reasoning-capable models (see Table A.1) but may sacrifice rating accuracy.

Show code
# Figure A.2: lightweight models only (cheap, non-reasoning tier).
other_models <- c("GPT-4o-mini", "Gemini 2.0 Flash")

# Overall ratings for the lightweight models joined to the human mean.
# Missing CI endpoints fall back to the midpoint via coalesce() so that
# downstream range aesthetics never receive NA.
scatter_other <- llm_metrics |>
  filter(criteria == "overall", model %in% other_models) |>
  inner_join(human_avg, by = "paper") |>
  mutate(
    diff = midpoint - human_mid,
    paper_short = str_trunc(paper, 25),
    human_lo = coalesce(human_lo, human_mid),
    human_hi = coalesce(human_hi, human_mid),
    lower_bound = coalesce(lower_bound, midpoint),
    upper_bound = coalesce(upper_bound, midpoint)
  ) |>
  filter(!is.na(human_mid), !is.na(midpoint))

# Scatter of LLM vs human-mean rating, one facet per model; the dashed
# diagonal is the identity (perfect-agreement) line.
if (nrow(scatter_other) > 0) {
  ggplot(scatter_other, aes(x = human_mid, y = midpoint, color = model)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(size = 3.5, alpha = 0.7) +
    ggrepel::geom_text_repel(aes(label = paper_short), size = 2.4, max.overlaps = 8, show.legend = FALSE) +
    scale_color_manual(values = MODEL_COLORS) +
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
    facet_wrap(~model, ncol = 2) +
    labs(x = "Human mean rating (0\u2013100)", y = "LLM rating (0\u2013100)", color = NULL) +
    theme_uj() +
    theme(strip.text = element_text(size = 13, face = "bold"))
} else {
  cat("No matching data for lightweight model scatter plot.\n")
}
Figure A.2: Human mean overall rating (x-axis) vs LLM overall rating (y-axis) for the two lightweight models GPT-4o-mini and Gemini 2.0 Flash. Each point is one paper; the dashed diagonal is the identity line. These models cost 50–200× less per paper than reasoning-capable models (see cost table above).

For per-paper scatter plots, rank comparisons, and the gap heatmap showing Human − LLM differences by paper and criterion, see the main Results chapter. Extended reasoning traces are available in Appendix C: LLM Traces.


  1. 0–100 percentiles relative to a reference group of ~‘all serious work you have read in this area in the last three years’, overall (global quality/impact), claims_evidence (claim clarity/support), methods (design/identification/robustness), advancing_knowledge (contribution to field/practice), logic_communication (internal consistency and exposition), open_science (reproducibility, data/code availability), and global_relevance (decision and priority usefulness). Journal tiers: (0–5, continuous) capture where the work should publish vs. will publish.↩︎