perf: restructure getOptimizationPRs to limit before joining

The data query was LEFT JOINing optimization_features and repositories
across ALL matching candidate events before sorting and applying LIMIT.
For accounts with thousands of events, this meant joining and sorting
far more rows than needed.

Restructure both the org and personal paths to use a two-phase CTE:
1. page_ids CTE: identify the page of event IDs, using EXISTS for the
   PR filter (no full JOIN to optimization_features), then sort and
   apply LIMIT/OFFSET
2. Outer query: JOIN only that page of IDs (~10 rows) with
   optimization_features and repositories for display fields

Also remove the now-unused dataWhereClause variable.
This commit is contained in:
Kevin Turcios 2026-04-11 04:15:36 -05:00
parent d6cab273bc
commit ee535ae9bc

View file

@ -523,17 +523,6 @@ export async function getOptimizationPRs(
${prCondition}
`
const dataWhereClause = `
${accountCondition}
${eventTypeCondition}
${repositoryCondition}
AND oe.is_optimization_found = true
AND (
oe.pr_url IS NOT NULL
OR of.pull_request IS NOT NULL
)
`
const safePageSize = Math.trunc(pageSize)
const offset = Math.trunc((page - 1) * safePageSize)
@ -640,18 +629,43 @@ export async function getOptimizationPRs(
let dataSql: string
if ("orgId" in payload) {
// Two-phase: first identify the page of event IDs (cheap — no JOINs
// to optimization_features for display data), then JOIN only those IDs.
dataSql = `
WITH page_ids AS (
SELECT oe.id
FROM optimization_events oe
WHERE ${accountCondition}
${eventTypeCondition}
${repositoryCondition}
AND oe.is_optimization_found = true
AND (
oe.pr_url IS NOT NULL
OR EXISTS (
SELECT 1 FROM optimization_features of2
WHERE of2.trace_id = oe.trace_id
AND of2.pull_request IS NOT NULL
)
)
ORDER BY oe.created_at DESC
LIMIT ${safePageSize} OFFSET ${offset}
)
SELECT ${selectFields}
FROM optimization_events oe
INNER JOIN page_ids pi ON pi.id = oe.id
LEFT JOIN optimization_features of ON oe.trace_id = of.trace_id
LEFT JOIN repositories r ON oe.repository_id = r.id
WHERE ${dataWhereClause}
ORDER BY oe.created_at DESC
LIMIT ${safePageSize} OFFSET ${offset}
`
} else {
// Personal: CTE with UNION to identify candidate event IDs via index
// scans, then JOIN for the data fields (only for the LIMIT'd set).
// Personal: two-phase CTE approach to avoid joining large tables
// before sorting and limiting.
//
// Phase 1 (candidates): UNION for index-backed scans, carrying
// id + created_at + pr_url + trace_id for filtering and sorting.
// Phase 2 (page_ids): Filter for PR presence (pr_url OR optimization_features),
// sort by created_at DESC, and LIMIT — so the expensive JOINs only
// happen for the final page of results.
const uid = sqlUserId(payload.userId)
const uname = sqlUsername(payload.username)
const eventFilter =
@ -662,24 +676,34 @@ export async function getOptimizationPRs(
const branchFilters = `AND ${eventFilter} AND is_optimization_found = true ${repoFilter}`
dataSql = `
WITH candidate_ids AS (
SELECT id FROM optimization_events
WITH candidates AS (
SELECT id, created_at, pr_url, trace_id FROM optimization_events
WHERE repository_id IN (${repoIdsString}) ${branchFilters}
UNION
SELECT id FROM optimization_events
SELECT id, created_at, pr_url, trace_id FROM optimization_events
WHERE user_id = '${uid}' ${branchFilters}
UNION
SELECT id FROM optimization_events
SELECT id, created_at, pr_url, trace_id FROM optimization_events
WHERE current_username = '${uname}' ${branchFilters}
),
page_ids AS (
SELECT id
FROM candidates c
WHERE c.pr_url IS NOT NULL
OR EXISTS (
SELECT 1 FROM optimization_features of2
WHERE of2.trace_id = c.trace_id
AND of2.pull_request IS NOT NULL
)
ORDER BY c.created_at DESC
LIMIT ${safePageSize} OFFSET ${offset}
)
SELECT ${selectFields}
FROM optimization_events oe
INNER JOIN candidate_ids ci ON ci.id = oe.id
INNER JOIN page_ids pi ON pi.id = oe.id
LEFT JOIN optimization_features of ON oe.trace_id = of.trace_id
LEFT JOIN repositories r ON oe.repository_id = r.id
WHERE (oe.pr_url IS NOT NULL OR of.pull_request IS NOT NULL)
ORDER BY oe.created_at DESC
LIMIT ${safePageSize} OFFSET ${offset}
`
}