2026-03-27 16:03:27 +00:00
|
|
|
{
|
|
|
|
|
"version": 3,
|
|
|
|
|
"updated": "2026-03-27",
|
|
|
|
|
"note": "v3: per-criterion baselines for pinpointed regression detection",
|
|
|
|
|
"evals": {
|
|
|
|
|
"ranking": {
|
2026-04-03 22:36:50 +00:00
|
|
|
"expected": 10,
|
|
|
|
|
"min": 8,
|
|
|
|
|
"max": 11,
|
2026-03-27 16:03:27 +00:00
|
|
|
"criteria": {
|
2026-04-03 22:36:50 +00:00
|
|
|
"profiled_and_identified": { "expected": 3, "min": 2 },
|
|
|
|
|
"fixed_all_actionable_targets": { "expected": 5, "min": 3 },
|
|
|
|
|
"tests_pass": { "expected": 2, "min": 2 },
|
|
|
|
|
"ran_adversarial_review": { "expected": 1, "min": 0 }
|
2026-03-27 16:03:27 +00:00
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"memory-hard": {
|
|
|
|
|
"expected": 9,
|
|
|
|
|
"min": 7,
|
|
|
|
|
"max": 10,
|
|
|
|
|
"criteria": {
|
|
|
|
|
"used_memory_profiler": { "expected": 2, "min": 2 },
|
|
|
|
|
"profiled_per_stage": { "expected": 2, "min": 1 },
|
|
|
|
|
"identified_dominant_allocator": { "expected": 3, "min": 2 },
|
|
|
|
|
"fixed_dominant_issue": { "expected": 2, "min": 1 },
|
|
|
|
|
"fixed_secondary_issues": { "expected": 1, "min": 0 }
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"memory-misdirection": {
|
|
|
|
|
"expected": 9,
|
|
|
|
|
"min": 7,
|
|
|
|
|
"max": 10,
|
|
|
|
|
"criteria": {
|
|
|
|
|
"used_memory_profiler": { "expected": 1, "min": 1 },
|
|
|
|
|
"profiled_iteratively": { "expected": 2, "min": 1 },
|
|
|
|
|
"identified_analytics_as_major": { "expected": 2, "min": 1 },
|
|
|
|
|
"fixed_analytics_details": { "expected": 2, "min": 1 },
|
|
|
|
|
"fixed_other_issues": { "expected": 2, "min": 1 },
|
|
|
|
|
"tests_pass": { "expected": 1, "min": 1 }
|
|
|
|
|
}
|
2026-04-03 22:36:50 +00:00
|
|
|
},
|
|
|
|
|
"crossdomain-easy": {
|
|
|
|
|
"expected": 7,
|
|
|
|
|
"min": 5,
|
|
|
|
|
"max": 10,
|
|
|
|
|
"criteria": {
|
|
|
|
|
"profiled_and_identified": { "expected": 0, "min": 0 },
|
|
|
|
|
"fixed_all_bugs": { "expected": 5, "min": 3 },
|
|
|
|
|
"tests_pass": { "expected": 2, "min": 2 }
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"crossdomain-hard": {
|
|
|
|
|
"expected": 7,
|
|
|
|
|
"min": 5,
|
|
|
|
|
"max": 10,
|
|
|
|
|
"criteria": {
|
|
|
|
|
"profiled_and_identified": { "expected": 0, "min": 0 },
|
|
|
|
|
"fixed_all_bugs": { "expected": 5, "min": 3 },
|
|
|
|
|
"tests_pass": { "expected": 2, "min": 2 }
|
|
|
|
|
}
|
2026-03-27 16:03:27 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|