Setup and libraries
source("setup_params.R")

library("tidyverse")
library("janitor")
library("stringr")
library("here")
library("knitr")
library("kableExtra")
library("ggrepel")
library("scales")
library("jsonlite")
library("purrr")
library("tibble")

# Theme and colors — crisp palette (high saturation, maximum separation)
UJ_ORANGE <- "#E8722A"   # vivid saffron orange
UJ_GREEN  <- "#2D9D5E"   # rich emerald green
UJ_BLUE   <- "#2B7CE9"   # clear azure blue

MODEL_COLORS <- c(
  "GPT-5 Pro"        = "#E8722A",   # saffron orange (focal)
  "GPT-5.2 Pro"      = "#D62839",   # bright crimson
  "GPT-4o-mini"      = "#17B890",   # vivid teal
  "Claude Sonnet 4"  = "#A855F7",   # vivid purple
  "Claude Opus 4.6"  = "#7C3AED",   # deep violet
  "Gemini 2.0 Flash" = "#2B7CE9",   # clear azure
  "Human"            = "#2D9D5E"    # emerald green
)

theme_uj <- function(base_size = 12) {
  theme_minimal(base_size = base_size) +
    theme(
      panel.grid.minor   = element_blank(),
      panel.grid.major   = element_line(linewidth = 0.3, color = "grey88"),
      plot.title          = element_text(face = "bold", size = rel(1.1)),
      plot.title.position = "plot",
      plot.subtitle       = element_text(color = "grey40", size = rel(0.9)),
      plot.caption        = element_text(color = "grey50", size = rel(0.8), hjust = 0),
      axis.title          = element_text(size = rel(0.95)),
      axis.text           = element_text(size = rel(0.88)),
      legend.position     = "bottom",
      legend.text         = element_text(size = rel(0.88)),
      legend.title        = element_text(size = rel(0.9), face = "bold"),
      strip.text          = element_text(face = "bold", size = rel(0.95))
    )
}

canon_metric <- function(x) dplyr::recode(
  x,
  "advancing_knowledge" = "adv_knowledge",
  "open_science"        = "open_sci",
  "logic_communication" = "logic_comms",
  "global_relevance"    = "gp_relevance",
  "claims_evidence"     = "claims",
  .default = x
)

`%||%` <- function(x, y) if (!is.null(x)) x else y
Load human evaluation data
UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |>
  mutate(label_paper_title = research, label_paper = paper) |>
  select(c("label_paper_title", "label_paper"))

rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |>
  clean_names() |>
  mutate(label_paper_title = research) |>
  select(-c("research"))

research <- read_csv("data/research.csv", show_col_types = FALSE) |>
  clean_names() |>
  filter(status == "50_published evaluations (on PubPub, by Unjournal)") |>
  left_join(UJmap, by = c("label_paper_title")) |>
  mutate(doi = str_trim(doi)) |>
  mutate(label_paper = case_when(
    doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023",
    doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025",
    doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022",
    doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023",
    TRUE ~ label_paper
  )) |>
  left_join(rsx, by = c("label_paper_title"))

key_map <- research |>
  transmute(label_paper_title = str_trim(label_paper_title), label_paper = label_paper) |>
  filter(!is.na(label_paper_title)) |>
  distinct(label_paper_title, label_paper) |>
  group_by(label_paper_title) |>
  slice(1) |>
  ungroup()

rsx_research <- rsx |>
  mutate(label_paper_title = str_trim(label_paper_title)) |>
  left_join(key_map, by = "label_paper_title", relationship = "many-to-one")

metrics_human <- rsx_research |>
  mutate(criteria = canon_metric(criteria)) |>
  filter(criteria %in% c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |>
  transmute(
    paper = label_paper, criteria, evaluator, model = "Human",
    mid = as.numeric(middle_rating),
    lo = suppressWarnings(as.numeric(lower_ci)),
    hi = suppressWarnings(as.numeric(upper_ci))
  ) |>
  filter(!is.na(paper), !is.na(mid)) |>
  mutate(
    lo = ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_),
    hi = ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_)
  ) |>
  mutate(across(c(mid, lo, hi), ~ round(.x, 4))) |>
  distinct(paper, criteria, model, evaluator, mid, lo, hi)

human_avg <- metrics_human |>
  filter(criteria == "overall") |>
  group_by(paper) |>
  summarise(
    human_mid = mean(mid, na.rm = TRUE),
    human_lo = mean(lo, na.rm = TRUE),
    human_hi = mean(hi, na.rm = TRUE),
    n_human = n(),
    .groups = "drop"
  )

n_human_papers <- n_distinct(metrics_human$paper)
n_human_evaluators <- n_distinct(metrics_human$evaluator)
Load LLM evaluation data (all models)
model_dirs <- list(
  "gpt5_pro_updated_jan2026" = "GPT-5 Pro",
  "gpt52_pro_focal_jan2026" = "GPT-5.2 Pro",
  "gpt_4o_mini_2024_07_18" = "GPT-4o-mini",
  "claude_sonnet_4_20250514" = "Claude Sonnet 4",
  "claude_opus_4_6" = "Claude Opus 4.6",
  "gemini_2.0_flash" = "Gemini 2.0 Flash"
)

parse_response <- function(path, model_name) {
  tryCatch({
    r <- jsonlite::fromJSON(path, simplifyVector = FALSE)
    paper <- basename(path) |>
      str_replace("\\.response\\.json$", "") |>
      str_replace_all("_", " ")

    parsed <- NULL
    if (!is.null(r$parsed) && length(r$parsed) > 0) {
      parsed <- r$parsed
    } else if (!is.null(r$output_text) && nchar(r$output_text) > 0) {
      txt <- r$output_text
      txt <- sub("^\\s*```[a-z]*\\s*\n?", "", txt)
      txt <- sub("\\s*```\\s*$", "", txt)
      parsed <- jsonlite::fromJSON(txt, simplifyVector = TRUE)
    } else if (!is.null(r$output)) {
      msg <- purrr::detect(r$output, ~ .x$type == "message", .default = NULL)
      if (!is.null(msg) && length(msg$content) > 0) {
        parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE)
      }
    }

    if (is.null(parsed)) return(NULL)

    metrics <- parsed$metrics
    metric_rows <- list()
    tier_rows <- list()
    tier_names <- c("tier_should", "tier_will", "journal_should", "journal_will")

    for (nm in names(metrics)) {
      if (nm %in% tier_names) {
        tier_kind <- sub("^journal_", "tier_", nm)
        tier_rows[[length(tier_rows) + 1]] <- tibble(
          paper = paper, model = model_name, tier_kind = tier_kind,
          score = metrics[[nm]]$score,
          ci_lower = metrics[[nm]]$ci_lower,
          ci_upper = metrics[[nm]]$ci_upper
        )
      } else {
        metric_rows[[length(metric_rows) + 1]] <- tibble(
          paper = paper, model = model_name, metric = nm,
          midpoint = metrics[[nm]]$midpoint,
          lower_bound = metrics[[nm]]$lower_bound,
          upper_bound = metrics[[nm]]$upper_bound
        )
      }
    }

    input_tok <- r$usage$input_tokens %||% r$input_tokens
    output_tok <- r$usage$output_tokens %||% r$output_tokens

    list(
      metrics = bind_rows(metric_rows),
      tiers = bind_rows(tier_rows),
      tokens = tibble(
        paper = paper, model = model_name,
        input_tokens = input_tok %||% NA_integer_,
        output_tokens = output_tok %||% NA_integer_
      )
    )
  }, error = function(e) NULL)
}

load_all_llm <- function() {
  all_metrics <- list()
  all_tiers <- list()
  all_tokens <- list()

  for (dir_name in names(model_dirs)) {
    model_name <- model_dirs[[dir_name]]
    json_dir <- here("results", dir_name, "json")

    if (dir.exists(json_dir)) {
      files <- list.files(json_dir, pattern = "\\.response\\.json$", full.names = TRUE)

      for (f in files) {
        result <- parse_response(f, model_name)
        if (!is.null(result)) {
          all_metrics[[length(all_metrics) + 1]] <- result$metrics
          all_tiers[[length(all_tiers) + 1]] <- result$tiers
          all_tokens[[length(all_tokens) + 1]] <- result$tokens
        }
      }
    }
  }

  list(
    metrics = bind_rows(all_metrics) |> mutate(criteria = canon_metric(metric)),
    tiers = bind_rows(all_tiers),
    tokens = bind_rows(all_tokens)
  )
}

llm_data <- load_all_llm()
llm_metrics <- llm_data$metrics
llm_tiers <- llm_data$tiers
llm_tokens <- llm_data$tokens

n_llm_models <- n_distinct(llm_metrics$model)
n_llm_papers <- n_distinct(llm_metrics$paper)

matched_papers <- intersect(
  unique(metrics_human$paper),
  unique(llm_metrics$paper)
)
n_matched <- length(matched_papers)

# Focal sample: papers evaluated by GPT-5 Pro AND humans
focal_papers <- intersect(
  llm_metrics |> filter(model == "GPT-5 Pro") |> pull(paper) |> unique(),
  unique(metrics_human$paper)
)
n_focal <- length(focal_papers)

primary_model <- if ("GPT-5 Pro" %in% unique(llm_metrics$model)) "GPT-5 Pro" else unique(llm_metrics$model)[1]

We evaluate GPT-5 Pro against human expert reviews from The Unjournal on 45 matched papers (papers with both GPT-5 Pro and human evaluations). The model receives the same PDF, a system prompt mirroring The Unjournal rubric, and a JSON schema requiring a diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. Results for five additional models are reported in Appendix A. Full methodological details appear in Methods.
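For reference, a minimal illustrative response body (hypothetical paper values) showing the numeric fields the loader below reads; the real responses also include free-text fields such as the diagnostic summary, which the numeric pipeline ignores:

# Hypothetical example of the expected response shape. Metric entries carry
# midpoint / lower_bound / upper_bound; tier entries carry score / ci_lower / ci_upper.
example_json <- '{
  "metrics": {
    "overall":     {"midpoint": 62, "lower_bound": 45, "upper_bound": 78},
    "methods":     {"midpoint": 55, "lower_bound": 38, "upper_bound": 70},
    "tier_should": {"score": 3.5, "ci_lower": 3.0, "ci_upper": 4.0}
  }
}'
str(jsonlite::fromJSON(example_json, simplifyVector = FALSE))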

We do not treat human ratings as ground truth. Quantitative percentile scoring is genuinely difficult: even domain experts disagree, and individual scores reflect both signal about paper quality and idiosyncratic tendencies (severity, topic familiarity, interpretation of the scale). Our question is whether an LLM provides signal comparable to an additional expert rater. The Human–Human baseline row in Table 2.1 provides this reference. Caution: the LLM’s ρ is computed against the mean of (on average) 1.9 human raters per paper, which reduces noise and inflates apparent agreement relative to the individual-vs-individual ρ_HH. The Spearman-Brown adjusted column corrects for this; the fair comparison is ρ adj. vs. ρ_HH. Krippendorff’s α_HH in the human-baseline table in Appendix A provides the criterion-level reference.
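The deflation factor sb_factor is defined upstream in setup_params.R (sourced above, not shown). A minimal sketch, assuming the standard Spearman-Brown treatment in which correlation against a k-rater mean exceeds correlation against a single rater by the square root of the reliability gain from averaging:

# Assumed reconstruction; the actual sb_factor comes from setup_params.R.
# corr(LLM, mean of k raters) = corr(LLM, one rater) * sqrt(k / (1 + (k - 1) * rho_hh)),
# so deflating to an individual-rater equivalent multiplies by the reciprocal of that gain.
sb_deflate <- function(k, rho_hh) sqrt((1 + (k - 1) * rho_hh) / k)
sb_deflate(k = 1.9, rho_hh = 0.432)  # ~0.86, consistent with 0.517 -> 0.444 in Table 2.1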

Per-paper overview. Figure 2.1 presents three complementary views of overall (0–100 percentile) ratings. Panel (a) displays individual human evaluator ratings alongside GPT-5 Pro (orange diamonds) for each paper, revealing both individual uncertainty (evaluators’ self-reported 90% credible intervals often span 20–40 percentile points) and inter-rater variability. In most cases the LLM falls within the range of human opinions, though several papers show substantial divergence. Panel (b) plots all pairwise human evaluator combinations, making the human-human agreement ceiling directly visible. Panel (c) compares GPT-5 Pro ratings against human mean ratings with per-paper labels.

Show code
library("patchwork")

# ── Panel (a): Forest plot ──
H_ind <- metrics_human |>
  filter(criteria == "overall", paper %in% matched_papers) |>
  mutate(
    lo = ifelse(is.finite(lo), pmax(0, lo), NA_real_),
    hi = ifelse(is.finite(hi), pmin(100, hi), NA_real_)
  )

ord <- H_ind |>
  group_by(paper) |>
  summarise(h_mean = mean(mid, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(h_mean)) |>
  mutate(pos = row_number())

H_plot <- H_ind |>
  inner_join(ord, by = "paper") |>
  group_by(paper) |>
  mutate(
    off = (row_number() - (n() + 1) / 2) * 0.18,
    x   = pos + off
  ) |>
  ungroup()

L_c <- llm_metrics |>
  filter(criteria == "overall", model == primary_model, paper %in% matched_papers) |>
  group_by(paper) |>
  summarise(
    mid = mean(midpoint, na.rm = TRUE),
    lo  = suppressWarnings(min(coalesce(lower_bound, midpoint), na.rm = TRUE)),
    hi  = suppressWarnings(max(coalesce(upper_bound, midpoint), na.rm = TRUE)),
    .groups = "drop"
  ) |>
  inner_join(ord, by = "paper") |>
  mutate(x = pos - 0.18)

hbar     <- mean(ord$h_mean, na.rm = TRUE)
lbar     <- mean(L_c$mid, na.rm = TRUE)

# Legend labels embed grand means so the annotation text can be dropped
lbl_human <- sprintf("Human evaluators (mean = %.1f)", hbar)
lbl_gpt   <- sprintf("GPT-5 Pro (mean = %.1f)", lbar)

leg_colors <- setNames(c(UJ_GREEN, UJ_ORANGE),
                       c(lbl_human, lbl_gpt))
leg_shapes <- setNames(c(16L, 18L),
                       c(lbl_human, lbl_gpt))

p_forest <- if (nrow(L_c) > 0) {
  ggplot() +
    geom_vline(data = ord, aes(xintercept = pos), color = "grey92", linewidth = 0.3) +
    geom_hline(yintercept = hbar,      color = UJ_GREEN,    linetype = "dotted", linewidth = 0.8) +
    geom_hline(yintercept = lbar,      color = UJ_ORANGE,   linetype = "dotted", linewidth = 0.8) +
    geom_errorbar(
      data = subset(H_plot, is.finite(lo) & is.finite(hi)),
      aes(x = x, ymin = lo, ymax = hi),
      width = 0, linewidth = 1, alpha = 0.5, color = UJ_GREEN
    ) +
    geom_point(data = H_plot,
               aes(x = x, y = mid, color = lbl_human, shape = lbl_human),
               size = 3.0, alpha = 0.9) +
    geom_errorbar(
      data = subset(L_c, is.finite(lo) & is.finite(hi)),
      aes(x = x, ymin = lo, ymax = hi),
      width = 0, linewidth = 1.0, color = UJ_ORANGE
    ) +
    geom_point(data = L_c,
               aes(x = x, y = mid, color = lbl_gpt, shape = lbl_gpt),
               size = 3.6) +
    scale_color_manual(name = NULL, values = leg_colors,
                       breaks = c(lbl_human, lbl_gpt)) +
    scale_shape_manual(name = NULL, values = leg_shapes,
                       breaks = c(lbl_human, lbl_gpt)) +
    guides(
      color = guide_legend(override.aes = list(size = 3.8, alpha = 1)),
      shape = guide_legend(override.aes = list(size = 3.8))
    ) +
    scale_x_continuous(
      breaks = ord$pos, labels = ord$paper,
      expand = expansion(mult = c(0.01, 0.03))
    ) +
    coord_cartesian(ylim = c(0, 100), clip = "off") +
    labs(x = NULL, y = "Percentile (0\u2013100)", tag = "(a)") +
    theme_uj() +
    theme(
      axis.text.x      = element_text(angle = 55, hjust = 1, vjust = 1, size = 9),
      axis.title.y     = element_text(size = 12),
      panel.grid.major.x = element_blank(),
      plot.margin      = margin(5, 40, 5, 5),
      plot.tag         = element_text(face = "bold", size = 14),
      legend.position  = "bottom",
      legend.direction = "horizontal",
      legend.text      = element_text(size = 10),
      legend.key.size  = unit(0.5, "cm"),
      legend.spacing.x = unit(0.5, "cm")
    )
} else {
  ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void()
}

# ── Panel (b): Human-Human scatter ──
# All pairwise evaluator combinations for papers with ≥2 human raters
hh_scatter_data <- metrics_human |>
  filter(criteria == "overall") |>
  select(paper, evaluator, mid) |>
  distinct() |>
  group_by(paper) |>
  filter(n() >= 2) |>
  group_modify(\(df, key) {
    idx <- combn(nrow(df), 2)
    tibble(mid_a = df$mid[idx[1, ]], mid_b = df$mid[idx[2, ]])
  }) |>
  ungroup()

p_hh <- if (nrow(hh_scatter_data) > 0) {
  hh_rho <- cor(hh_scatter_data$mid_a, hh_scatter_data$mid_b,
                method = "spearman", use = "complete.obs")
  hh_lbl <- sprintf("Spearman \u03c1 = %.2f\nN = %d pairs", hh_rho, nrow(hh_scatter_data))
  ggplot(hh_scatter_data, aes(x = mid_a, y = mid_b)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(color = UJ_GREEN, size = 3, alpha = 0.7) +
    annotate("text", x = 4, y = 96, label = hh_lbl,
             hjust = 0, vjust = 1, size = 3.4, color = "grey30") +
    coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) +
    labs(x = "Evaluator A (overall)", y = "Evaluator B (overall)", tag = "(b)") +
    theme_uj() +
    theme(plot.tag = element_text(face = "bold", size = 14))
} else {
  ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void()
}

# ── Panel (c): LLM vs Human scatter ──
scatter_data <- llm_metrics |>
  filter(criteria == "overall", model == "GPT-5 Pro") |>
  inner_join(human_avg, by = "paper") |>
  mutate(
    diff = midpoint - human_mid,
    paper_short = str_trunc(paper, 25),
    human_lo = coalesce(human_lo, human_mid),
    human_hi = coalesce(human_hi, human_mid),
    lower_bound = coalesce(lower_bound, midpoint),
    upper_bound = coalesce(upper_bound, midpoint)
  ) |>
  filter(!is.na(human_mid), !is.na(midpoint))

# Spearman rho for the panel (c) annotation (single model here, so one row)
scatter_rho <- scatter_data |>
  group_by(model) |>
  summarise(
    rho = cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
    .groups = "drop"
  ) |>
  mutate(lbl = sprintf("Spearman \u03c1 = %.2f", rho),
         x = 4, y = 96)

p_scatter <- if (nrow(scatter_data) > 0) {
  rho_lbl <- sprintf("Spearman \u03c1 = %.2f", scatter_rho$rho[1])
  ggplot(scatter_data, aes(x = human_mid, y = midpoint)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(size = 3.5, alpha = 0.7, color = UJ_ORANGE) +
    ggrepel::geom_text_repel(aes(label = paper_short), size = 2.4, max.overlaps = 8) +
    annotate("text", x = 4, y = 96, label = rho_lbl,
             hjust = 0, vjust = 1, size = 3.4, color = "grey30") +
    coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) +
    labs(x = "Human mean rating (0\u2013100)", y = "GPT-5 Pro rating (0\u2013100)", tag = "(c)") +
    theme_uj() +
    theme(
      plot.tag = element_text(face = "bold", size = 14)
    )
} else {
  ggplot() + annotate("text", x = 0.5, y = 0.5, label = "No matching data") + theme_void()
}

# ── Combine ──
bottom_row <- wrap_plots(p_hh, p_scatter, widths = c(1, 1))
(p_forest / bottom_row) + plot_layout(heights = c(1, 0.85))
Figure 2.1: Per-paper overall ratings (0–100 percentile). (a) Individual human evaluator midpoints (green circles) with each evaluator’s self-reported 90% credible interval (reflecting their own uncertainty about the true score; the vertical separation between green dots per paper reflects inter-rater disagreement) and GPT-5 Pro (orange diamonds with CI), sorted by descending human mean. Dotted horizontal lines show grand means. (b) Pairwise human evaluator agreement: each point is one evaluator pair (papers with 3 raters contribute 3 points). (c) Human mean vs GPT-5 Pro overall rating; dashed diagonal is the identity line. Compare panels (b) and (c) directly to see whether LLM-human scatter is tighter than human-human scatter.

Panel (b) makes the human-human ceiling directly visible: the scatter of evaluator pairs is no tighter than the GPT-5 Pro–human scatter in panel (c), and evaluator pairs can disagree by as much as 30–40 percentile points on a single paper. GPT-5 Pro clusters around the identity line in the 40–80 range but diverges more at the extremes, compressing ratings toward the centre of the scale relative to humans—a pattern consistent with alignment training that discourages extreme outputs. Where humans rate a paper very highly or very harshly, the LLM typically pulls toward the middle. Full agreement metrics for all six models appear in Appendix A.
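One quick way to quantify this compression, sketched here against the scatter_data frame built for panel (c) (not a figure in the original analysis):

# Compression shows up as lower dispersion in the LLM ratings than in the
# human means, and as a regression slope below 1.
c(sd_llm = sd(scatter_data$midpoint), sd_human = sd(scatter_data$human_mid))
coef(lm(midpoint ~ human_mid, data = scatter_data))[["human_mid"]]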

Rating differences by paper and criterion. Figure 2.2 unpacks Human − GPT-5 Pro differences across all seven criteria simultaneously. Each column is a paper, each row a criterion, and tile colour encodes the signed difference (human mean minus GPT-5 Pro midpoint). Green tiles indicate the human panel rated the paper higher; orange tiles indicate GPT-5 Pro was more generous. Papers are sorted by overall difference.

Show code
library("patchwork")

metric_order <- c("overall", "claims", "methods", "adv_knowledge",
                  "logic_comms", "open_sci", "gp_relevance")
metric_lab <- c(
  overall = "Overall",
  claims = "Claims & Evidence",
  methods = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms = "Logic & Comms",
  open_sci = "Open Science",
  gp_relevance = "Global Relevance"
)

H_mean <- metrics_human |>
  filter(criteria %in% metric_order, paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(h = mean(mid, na.rm = TRUE), .groups = "drop")

# Fixed paper order based on GPT-5 Pro overall difference
L_mean_primary <- llm_metrics |>
  filter(criteria %in% metric_order, model == primary_model, paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

Ddiff_primary <- inner_join(H_mean, L_mean_primary, by = c("paper", "criteria")) |>
  mutate(diff = h - l)

ord_p <- Ddiff_primary |>
  filter(criteria == "overall") |>
  arrange(desc(diff)) |>
  pull(paper)

build_gap_panel <- function(model_name, show_y_labels = TRUE,
                            llm_col = UJ_ORANGE) {
  L_mean_m <- llm_metrics |>
    filter(criteria %in% metric_order, model == model_name, paper %in% matched_papers) |>
    group_by(paper, criteria) |>
    summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

  Ddiff_m <- inner_join(H_mean, L_mean_m, by = c("paper", "criteria")) |>
    mutate(
      diff = h - l,
      crit = factor(criteria, levels = metric_order, labels = metric_lab[metric_order])
    )

  p <- ggplot(Ddiff_m, aes(x = factor(paper, levels = ord_p), y = crit, fill = diff)) +
    geom_tile(color = "white", linewidth = 0.25) +
    scale_fill_gradient2(
      low = llm_col, mid = "grey95", high = UJ_GREEN, midpoint = 0,
      name = "Human \u2212 LLM",
      limits = c(-30, 30),
      oob = scales::squish
    ) +
    labs(x = NULL, y = NULL, title = model_name) +
    theme_uj() +
    theme(
      axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, size = 8),
      panel.grid = element_blank()
    )

  if (show_y_labels) {
    p <- p + theme(axis.text.y = element_text(size = 11))
  } else {
    p <- p + theme(axis.text.y = element_blank())
  }

  p
}

if (length(ord_p) > 0) {
  build_gap_panel("GPT-5 Pro", show_y_labels = TRUE, llm_col = UJ_ORANGE) +
    theme(legend.position = "bottom")
} else {
  cat("Insufficient data for gap heatmap.\n")
}
Figure 2.2: Human minus GPT-5 Pro rating difference for every paper (columns) and criterion (rows). Green tiles indicate the human mean was higher; orange tiles indicate GPT-5 Pro rated the paper higher. Colour is clamped to ±30 percentile points. Papers are sorted by overall difference.

Columns with uniformly green or orange tiles indicate papers where humans and GPT-5 Pro disagree systematically across all criteria, not just on one dimension. Rows with consistent colour suggest criterion-level biases—for instance, if the “Open Science” row is green across nearly all papers, humans may systematically reward data-sharing practices more than the model does. Multi-model comparisons in Appendix A reveal whether these disagreement patterns are GPT-5 Pro–specific or shared across frontier LLMs.
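Both readings can be checked numerically from the Ddiff_primary frame built above (a sketch; the criterion-level means for GPT-5 Pro also appear, with opposite sign, as Mean bias in Table 2.2):

# Column reading: mean signed difference per paper (systematic disagreement)
Ddiff_primary |>
  group_by(paper) |>
  summarise(mean_diff = mean(diff), .groups = "drop") |>
  arrange(desc(abs(mean_diff)))

# Row reading: mean signed difference per criterion (positive = humans higher)
Ddiff_primary |>
  group_by(criteria) |>
  summarise(mean_diff = mean(diff), .groups = "drop")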

Qualitative critique comparison. Beyond numeric ratings, we compare the substantive critiques each model raises against the consensus issues identified by human experts. The figure below plots coverage (the fraction of human-identified concerns that the LLM also raised in some form) against precision (the fraction of LLM-raised issues that correspond to a substantive human concern). These metrics are assessed by GPT-5.2 Pro acting as an independent judge (see Methods for the judging protocol).
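Concretely, where the judge reports matched and unmatched issue counts (the loading code below selects matched_pairs, unmatched_human, and unmatched_llm where present), the two percentages reduce to the following arithmetic; a sketch assuming those fields are counts:

# Assumed arithmetic; the shipped coverage_pct / precision_pct come directly
# from the judge output rather than being recomputed here.
coverage_pct_of  <- function(matched, unmatched_human) 100 * matched / (matched + unmatched_human)
precision_pct_of <- function(matched, unmatched_llm)   100 * matched / (matched + unmatched_llm)

coverage_pct_of(6, 2)   # 75: the LLM raised 6 of the 8 human-identified issues
precision_pct_of(6, 4)  # 60: 6 of the 10 LLM-raised issues matched a human concern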

Load critique comparison data
comparison_results_file <- c(
  "results/key_issue_comp_results.json",
  "results/key_issues_comparison_results.json"
)
comparison_results_file <- comparison_results_file[file.exists(comparison_results_file)][1]

has_critique_data <- FALSE

if (!is.na(comparison_results_file)) {
  comparison_file <- "results/key_issues_comparison.json"
  if (file.exists(comparison_file)) {
    comparison_data <- fromJSON(comparison_file)
    llm_results_raw <- fromJSON(comparison_results_file)
    llm_results <- llm_results_raw |>
      as_tibble() |>
      unnest_wider(comparison) |>
      select(
        gpt_paper,
        coverage_pct,
        precision_pct,
        any_of(c("matched_pairs", "unmatched_human", "unmatched_llm")),
        any_of(c("missed_issues", "extra_issues")),
        overall_rating,
        overall_justification,
        detailed_notes
      )
    comparison_data <- comparison_data |>
      left_join(llm_results, by = "gpt_paper")
    has_critique_data <- nrow(comparison_data) > 0 &&
      "coverage_pct" %in% names(comparison_data) &&
      any(!is.na(comparison_data$coverage_pct))
  }
}
Show code
if (has_critique_data) {
  crit_results <- comparison_data |>
    filter(!is.na(coverage_pct) & !is.na(precision_pct)) |>
    mutate(paper_short = str_trunc(gpt_paper, 25))

  ggplot(crit_results, aes(x = coverage_pct, y = precision_pct)) +
    geom_point(size = 4.5, color = UJ_ORANGE, alpha = 0.85) +
    geom_text(aes(label = paper_short), hjust = -0.1, vjust = 0.5, size = 2.6, check_overlap = TRUE) +
    geom_vline(xintercept = mean(crit_results$coverage_pct), linetype = "dashed", color = UJ_BLUE, linewidth = 0.6) +
    geom_hline(yintercept = mean(crit_results$precision_pct), linetype = "dashed", color = UJ_BLUE, linewidth = 0.6) +
    scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, 20)) +
    scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 20)) +
    labs(
      x = "Coverage (%) \u2014 Human issues captured by LLM",
      y = "Precision (%) \u2014 LLM issues that are substantive"
    ) +
    theme_uj()
}

Coverage and precision vary substantially across papers. Some papers score high on both dimensions, indicating strong alignment between human and LLM critiques, while others reveal the LLM missing key human concerns or raising issues absent from the expert consensus. Because these metrics are themselves LLM-assessed, they carry the caveat that an LLM judge may systematically over- or under-credit matches relative to a human annotator; manual validation through our annotation tool is underway, and preliminary results are consistent with the automated scores.

Detailed paper-by-paper comparisons—including matched issue pairs with severity labels, structural difference tables, and per-evaluator breakdowns—appear in Appendix B: Critiques & Key Issues. Extended quantitative analysis including all six models, per-criterion correlations, bootstrap confidence intervals, tier prediction accuracy, and cost-quality trade-offs is reported in Appendix A: Results Ratings. The full LLM reasoning traces and assessment summaries are available in Appendix C: LLM Traces.

Agreement summary. Table 2.1 shows GPT-5 Pro’s agreement with the human mean on overall ratings, alongside the Human–Human baseline as a reference. The Spearman-Brown adjusted column (ρ adj.) is the appropriate comparator to ρ_HH; the full six-model comparison appears in Appendix A.
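The table code relies on quantities defined upstream in setup_params.R and earlier hidden chunks (sb_factor, spearman_ci95, and the hh_* baseline values). A plausible reconstruction of spearman_ci95, consistent with the intervals reported in Table 2.1, is a Fisher z interval with standard error 1/sqrt(n − 3):

# Assumed reconstruction; reproduces "[0.26, 0.70]" for rho = 0.517, n = 45
# and "[0.13, 0.66]" for rho = 0.432, n = 37, matching Table 2.1.
spearman_ci95 <- function(rho, n) {
  z  <- atanh(rho)
  se <- 1 / sqrt(n - 3)
  ci <- tanh(z + c(-1, 1) * qnorm(0.975) * se)
  sprintf("[%.2f, %.2f]", ci[1], ci[2])
}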

Show code
model_compare_data <- llm_metrics |>
  filter(criteria == "overall", model == "GPT-5 Pro", paper %in% focal_papers) |>
  left_join(
    human_avg |> select(paper, human_mid),
    by = "paper"
  ) |>
  filter(!is.na(human_mid))

if (nrow(model_compare_data) > 0) {
  raw_rho <- cor(model_compare_data$human_mid, model_compare_data$midpoint,
                 method = "spearman", use = "complete.obs")
  n_gpt   <- nrow(model_compare_data)

  llm_row <- tibble(
    Model          = "GPT-5 Pro",
    N              = n_gpt,
    `ρ (vs. mean)` = sprintf("%.3f", raw_rho),
    `ρ adj. (SB)`  = sprintf("%.3f", raw_rho * sb_factor),
    `95% CI`       = spearman_ci95(raw_rho, n_gpt),
    `Pearson r`    = round(cor(model_compare_data$human_mid, model_compare_data$midpoint,
                               use = "complete.obs"), 3),
    `Mean bias`    = sprintf("%+.1f", mean(model_compare_data$midpoint - model_compare_data$human_mid,
                                           na.rm = TRUE)),
    MAE            = round(mean(abs(model_compare_data$midpoint - model_compare_data$human_mid),
                                na.rm = TRUE), 1)
  )

  hh_row_main <- tibble(
    Model          = "Human–Human",
    N              = hh_n,
    `ρ (vs. mean)` = sprintf("%.3f", hh_spearman),
    `ρ adj. (SB)`  = "—",
    `95% CI`       = spearman_ci95(hh_spearman, hh_n),
    `Pearson r`    = round(hh_pearson, 3),
    `Mean bias`    = hh_bias,
    MAE            = hh_mae
  )

  model_compare_tbl <- bind_rows(hh_row_main, llm_row)

  knitr::kable(model_compare_tbl, align = c("l", rep("r", ncol(model_compare_tbl) - 1)),
               booktabs = TRUE) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0")
}
Table 2.1: GPT-5 Pro agreement with human mean overall rating (N = 45 matched papers). Human–Human row (bold) shows pairwise individual-vs-individual Spearman ρ as a reference. ρ (vs. mean): raw Spearman ρ between GPT-5 Pro and the human mean—upward-biased because the mean suppresses noise. ρ adj. (SB): Spearman-Brown corrected to an individual-rater equivalent (×0.86 for k ≈ 1.9 raters/paper); compare this to the Human–Human ρ = 0.432. 95% CI: Fisher z. Bias = LLM − Human.
| Model       | N  | ρ (vs. mean) | ρ adj. (SB) | 95% CI       | Pearson r | Mean bias | MAE  |
|-------------|----|--------------|-------------|--------------|-----------|-----------|------|
| Human–Human | 37 | 0.432        | —           | [0.13, 0.66] | 0.565     | +4.1      | 11.0 |
| GPT-5 Pro   | 45 | 0.517        | 0.444       | [0.26, 0.70] | 0.342     | +6.4      | 10.3 |

Table 2.2 breaks the same agreement metrics down by evaluation criterion for GPT-5 Pro. The H-H ρ column shows pairwise human-human Spearman ρ for each criterion as a reference — note that raw LLM ρ is upward-biased relative to H-H ρ by the mean-vs-individual asymmetry (see Table 2.1). Criteria where H-H ρ is itself low indicate genuine expert disagreement; low LLM agreement on those criteria is therefore expected rather than a model failure.

Show code
criteria_order <- c("overall", "claims", "methods", "adv_knowledge",
                    "logic_comms", "open_sci", "gp_relevance")
criteria_labels <- c(
  overall      = "Overall",
  claims       = "Claims & Evidence",
  methods      = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms  = "Logic & Comms",
  open_sci     = "Open Science",
  gp_relevance = "Global Relevance"
)

# Human-human pairwise Spearman rho per criterion (ceiling reference).
# Note: uses only the first two evaluators per paper-criterion cell, unlike
# panel (b), which uses all pairwise combinations.
hh_crit_pairs <- metrics_human |>
  filter(criteria %in% criteria_order, paper %in% focal_papers) |>
  select(paper, criteria, evaluator, mid) |>
  distinct() |>
  group_by(paper, criteria) |>
  filter(n() >= 2) |>
  mutate(slot = paste0("E", row_number())) |>
  ungroup() |>
  pivot_wider(names_from = slot, values_from = c(mid, evaluator)) |>
  filter(!is.na(mid_E1), !is.na(mid_E2))

hh_crit_rho <- hh_crit_pairs |>
  group_by(criteria) |>
  summarise(
    `H-H ρ` = round(cor(mid_E1, mid_E2, method = "spearman", use = "complete.obs"), 3),
    .groups = "drop"
  )

human_by_crit <- metrics_human |>
  filter(criteria %in% criteria_order, paper %in% focal_papers) |>
  group_by(paper, criteria) |>
  summarise(human_mid = mean(mid, na.rm = TRUE), .groups = "drop")

crit_compare_data <- llm_metrics |>
  filter(criteria %in% criteria_order, model == "GPT-5 Pro", paper %in% focal_papers) |>
  inner_join(human_by_crit, by = c("paper", "criteria"))

if (nrow(crit_compare_data) > 0) {
  crit_tbl <- crit_compare_data |>
    group_by(criteria) |>
    summarise(
      `Spearman ρ` = round(cor(human_mid, midpoint, method = "spearman",
                                    use = "complete.obs"), 3),
      `Pearson r`  = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      `Mean bias`       = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      RMSE              = round(sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)), 1),
      .groups = "drop"
    ) |>
    left_join(hh_crit_rho, by = "criteria") |>
    mutate(Criterion = factor(criteria, levels = criteria_order,
                              labels = criteria_labels[criteria_order])) |>
    select(Criterion, `H-H ρ`, `Spearman ρ`, `Pearson r`, `Mean bias`, RMSE) |>
    arrange(Criterion)

  knitr::kable(crit_tbl, align = c("l", rep("r", 5))) |>
    kableExtra::column_spec(2, bold = TRUE)
}
Table 2.2: GPT-5 Pro agreement with human mean by criterion (N = 45 matched papers). H-H ρ shows pairwise human evaluator Spearman ρ (the reference for each criterion; see the Table 2.1 note about mean-vs-individual bias). Positive bias = GPT-5 Pro rates higher on average. Note the variation: human evaluators agree strongly on some dimensions and barely at all on others (Open Science).
| Criterion         | H-H ρ  | Spearman ρ | Pearson r | Mean bias | RMSE |
|-------------------|--------|------------|-----------|-----------|------|
| Overall           | 0.432  | 0.517      | 0.342     | +6.4      | 15.4 |
| Claims & Evidence | 0.400  | 0.388      | 0.213     | +4.0      | 18.3 |
| Methods           | 0.362  | 0.536      | 0.339     | +4.5      | 18.5 |
| Adv. Knowledge    | 0.179  | 0.459      | 0.303     | +7.9      | 17.6 |
| Logic & Comms     | 0.225  | 0.276      | 0.187     | +9.6      | 16.5 |
| Open Science      | -0.033 | 0.125      | 0.166     | -17.0     | 27.7 |
| Global Relevance  | 0.130  | 0.429      | 0.345     | +6.1      | 16.0 |