codeflash-agent/reports/unstructured-security/security_data.json

{
  "core_product_base": "https://github.com/Unstructured-IO/core-product/pull",
  "github_workflows_base": "https://github.com/Unstructured-IO/github-workflows/pull",
  "platform_libs_base": "https://github.com/Unstructured-IO/platform-libs/pull",
  "unstructured_base": "https://github.com/Unstructured-IO/unstructured/pull",
  "unstructured_inference_base": "https://github.com/Unstructured-IO/unstructured-inference/pull",
  "audit_metadata": {
    "date": "2026-04-16",
    "repos_audited": 8,
    "repos": [
      "core-product",
      "github-workflows",
      "platform-libs",
      "unstructured",
      "unstructured-inference",
      "unstructured-od-models",
      "unstructured-python-client",
      "unstructured.pytesseract"
    ],
    "workflow_files_audited": 69,
    "dockerfiles_audited": 6,
    "action_references_audited": 446,
    "categories_audited": [
      "Supply Chain",
      "CI/CD Security",
      "Docker/Container Security",
      "Code-Level Vulnerability",
      "Secrets Management",
      "Build Reproducibility",
      "Vulnerability Management",
      "PKI/Cryptography"
    ]
  },
  "findings": [
    {
      "id": "SEC-001",
      "title": "Lockfile Bypass via uv pip install in Docker Builds",
      "severity": "critical",
      "category": "Supply Chain",
      "status": "partially-fixed",
      "fixed_by": "Unstructured (PR #1465)",
      "repo": "core-product",
      "duration_exposed": "~2 months (core path); ongoing (residual)",
      "description": "Core-product's uv migration used `uv pip install` for ALL Docker build dependencies, completely bypassing the lockfile. Unstructured fixed the primary dependency tree by switching to `uv sync --locked` (PR #1465). However, `uv pip install` is still used in Docker builds for 3 package groups outside the main lockfile: inference proprietary deps (resolved from a separate pyproject.toml at build time), upstream packages installed with --no-deps, and custom OpenCV FIPS-compliant wheels. Additionally, 6+ CI workflow steps and 2 Makefile targets still use `uv pip install` for tooling (pip, setuptools, wheel, types-requests, boto3, awscli, ruff).",
      "impact": [
        "FIXED: Main dependency tree now uses uv sync --locked — builds are reproducible for core packages",
        "REMAINING: Inference deps (unstructured_inference_prop/pyproject.toml) are resolved at build time, not from a lockfile — different builds can get different inference dependency versions",
        "REMAINING: 6+ CI workflow steps install packages via uv pip install outside any lockfile (ci.yml:67,282,309,370; daily-metric.yml:136; process-render-only-files.yml:125)",
        "REMAINING: Makefile install target uses uv pip install --no-deps for upstream versions (line 36)",
        "torch CUDA variant issue is resolved; original 15+ workflow patterns reduced but not eliminated"
      ],
      "context": "The critical path (production Docker image core dependencies) is fixed. The remaining uv pip install usage is a reduced but real risk — inference dependencies in Docker builds are still resolved non-deterministically, and CI tooling versions are unpinned."
    },
    {
      "id": "SEC-002",
      "title": "CVE Remediation Not Reaching Production",
      "severity": "high",
      "category": "Vulnerability Management",
      "status": "partially-fixed",
      "fixed_by": "Unstructured (PRs #1423, #1434, #1437, #1465)",
      "repo": "core-product",
      "duration_exposed": "Unknown (core path fixed); ongoing (inference deps)",
      "description": "Because of the lockfile bypass (SEC-001), CVE remediation PRs merged into main never changed what was installed in production Docker images. The core dependency tree now uses uv sync --locked, so CVE fixes to packages in uv.lock DO reach production. However, inference dependencies installed via `uv pip install -r unstructured_inference_prop/pyproject.toml` are still resolved at build time — CVE fixes to inference dependencies (torch, transformers, onnxruntime, etc.) may still not deploy as expected.",
      "impact": [
        "FIXED: CVE remediation for packages in the main uv.lock now reaches production images",
        "REMAINING: CVE fixes to inference dependencies (installed from unstructured_inference_prop/pyproject.toml without a lockfile constraint) are resolved at build time — Renovate/Dependabot fixes may be overridden by pip resolution",
        "REMAINING: CI workflow dependencies (boto3, awscli, types-requests) are installed without version pins — vulnerable versions could be pulled",
        "Original issues (CVE-2026-28351 pypdf, PR #1423, #1434, #1437) are resolved"
      ],
      "context": "The core fix works. The residual risk is narrower — limited to inference deps and CI tooling — but inference packages (torch, transformers, onnxruntime) are high-value targets for supply chain attacks."
    },
    {
      "id": "SEC-003",
      "title": "pip.conf Bypass in CI Workflows",
      "severity": "medium",
      "category": "Build Pipeline",
      "status": "fixed",
      "fixed_by": "Codeflash (PR #361) — incidental fix during uv workspace migration",
      "repo": "github-workflows",
      "description": "The shared github-workflows configure-pypi step writes a .venv/pip.conf at the work-dir level, but in uv workspace mode the .venv is at the repo root. This caused private Azure DevOps PyPI index credentials to be written to the wrong location.",
      "impact": [
        "Internal packages (utic-instrumentation, unstructured-prompts) failed to resolve in uv workspace mode",
        "Workaround required environment variable credential injection",
        "Blocked uv workspace migration for platform-libs"
      ],
      "context": "Discovered while building the uv workspace POC for platform-libs."
    },
    {
      "id": "SEC-004",
      "title": "No uv Version Pinning Across Repos",
      "severity": "medium",
      "category": "Build Reproducibility",
      "status": "open",
      "repo": "org-wide",
      "duration_exposed": "Ongoing",
      "description": "None of Unstructured's 7 repos use required-version in [tool.uv]. CI uses setup-uv versions ranging from v5 to v7 across repos. Different uv versions can produce different lockfile resolutions.",
      "impact": [
        "Different developers on different uv versions can produce different lockfiles",
        "CI uses setup-uv versions ranging from v5 to v7 — inconsistent resolution behavior",
        "No .gitattributes or documented process for handling uv.lock merge conflicts (main repo has a 1.4 MB lockfile)",
        "Version constraint mismatch: unstructured-inference requires Python >=3.12,<3.13 while unstructured allows >=3.11"
      ],
      "context": "Creates a vector for lockfile divergence. When two developers generate different lockfiles, one may inadvertently introduce a vulnerable dependency version."
    },
    {
      "id": "SEC-005",
      "title": "uv vs. pip Resolution Differences (Silent Wrong-Version Selection)",
      "severity": "medium",
      "category": "Supply Chain",
      "status": "open",
      "repo": "unstructured",
      "description": "numba>=0.60.0 was resolved by uv to version 0.53.1 (Python <3.10 only), while pip correctly picked 0.63.1 (Python 3.12 compatible). The resolution behavior differences between uv and pip are a real risk during migration.",
      "impact": [
        "Wrong package version installed silently — no error, just incorrect behavior",
        "uv's resolution algorithm differs from pip in edge cases involving Python version markers",
        "Could lead to installing packages with known vulnerabilities if the wrong version is selected"
      ],
      "context": "Discovered during the unstructured repo's uv migration. This class of bug is subtle and hard to catch."
    },
    {
      "id": "SEC-006",
      "title": "Socket Security Org-Level Ruleset",
      "severity": "info",
      "category": "Positive Control",
      "status": "active",
      "repo": "org-wide",
      "description": "Unstructured-IO has an org-level GitHub ruleset (ID 14342252) requiring the Socket Security GitHub App to scan all PRs for supply chain security issues.",
      "impact": [
        "All PRs across the org are scanned for dependency supply chain issues",
        "Cannot be disabled at the repo level — enforced org-wide",
        "Catches new vulnerable dependencies before they reach main"
      ],
      "context": "Effective control. However, it only covers new PRs — does not retroactively scan existing dependencies, and cannot catch the lockfile bypass issue (SEC-001) since uv pip install doesn't change the lockfile."
    },
    {
      "id": "SEC-007",
      "title": "Claude Code Action Triggered by Any GitHub Commenter",
      "severity": "critical",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "org-wide (5 repos)",
      "files": [
        "unstructured/.github/workflows/claude.yml",
        "core-product/.github/workflows/claude.yml",
        "unstructured-inference/.github/workflows/claude.yml",
        "unstructured-python-client/.github/workflows/claude.yml",
        "platform-libs/.github/workflows/claude.yaml"
      ],
      "description": "The Claude Code Action triggers on `issue_comment` for ANY user who comments `@claude` on any issue or PR. The workflow receives the ANTHROPIC_API_KEY secret and runs Claude with Bash tool access — a prompt injection and secret exfiltration vector.",
      "impact": [
        "Any GitHub user can trigger the workflow by commenting @claude on any public issue/PR",
        "The GH_ANTHROPIC_API_KEY secret is exposed to the workflow environment",
        "Claude follows attacker-controlled instructions from the comment body",
        "Prompt injection + secret exfiltration vector",
        "No actor/association check limits who can trigger it"
      ],
      "context": "Present in all 5 repos with Claude Code Action. Permissions are read-only which limits blast radius, but the API key is still exposed."
    },
    {
      "id": "SEC-008",
      "title": "Workflow Injection via Unsanitized release_name Input",
      "severity": "critical",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "github-workflows",
      "files": ["github-workflows/.github/workflows/create-release.yml:155"],
      "description": "The `release_name` workflow input is a free-text string injected directly into a `run:` shell block without sanitization. The workflow also has access to a GitHub App private key token (UTIC_GITHUB_CICD_TOKEN_GENERATOR_PRIVATE_KEY), making this a secret exfiltration vector via crafted release names.",
      "impact": [
        "Any user with write access can inject arbitrary shell commands via a crafted release name",
        "GitHub App private key accessible in the same workflow context",
        "When called via workflow_call, bump_type becomes a string type with no validation (workflow_dispatch constrains it to a choice)",
        "Shared workflow used by multiple repos — blast radius extends across the org"
      ],
      "context": "Classic workflow injection pattern — direct ${{ }} interpolation of user input in run: blocks."
    },
    {
      "id": "SEC-009",
      "title": "pip install from Third-Party Chinese PyPI Mirror in Docker Build",
      "severity": "critical",
      "category": "Supply Chain",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/triton-server/Dockerfile:20"],
      "description": "The triton-server Dockerfile installs paddlepaddle-gpu from https://pypi.tuna.tsinghua.edu.cn/simple — a third-party Chinese university mirror instead of official PyPI. No hash verification is performed on the downloaded package.",
      "impact": [
        "Third-party mirror could serve modified/trojanized packages without detection",
        "No --require-hashes flag — no integrity verification",
        "Triton server processes sensitive customer documents in regulated industries",
        "A compromised mirror is a single point of supply chain failure"
      ],
      "context": "Supply chain risk for a document processing pipeline handling HIPAA, SOC 2, PCI-DSS regulated data."
    },
    {
      "id": "SEC-010",
      "title": "Private PyPI Credentials Persisted in GitHub Actions Cache",
      "severity": "critical",
      "category": "Secrets Management",
      "status": "open",
      "repo": "github-workflows",
      "files": ["github-workflows/.github/workflows/setup-legacy-environment.yml:84-101"],
      "description": "PRIVATE_PYPI_INDEX_URL (containing embedded Azure DevOps password in the URL) is written to .venv/pip.conf, then the entire .venv/ directory is cached by actions/cache. The credential persists in the GitHub Actions cache across workflow runs.",
      "impact": [
        "Azure DevOps PAT/token persists beyond the workflow run in the actions cache",
        "Any subsequent workflow run that restores this cache can read the credential",
        "Credential potentially accessible to fork PR workflows if cache key matches",
        "Violates the principle that secrets should be ephemeral"
      ],
      "context": "The credential is an Azure DevOps access token for the private unstructured package index."
    },
    {
      "id": "SEC-011",
      "title": "ECR Password Passed as Unmasked Job Output",
      "severity": "critical",
      "category": "Secrets Management",
      "status": "open",
      "repo": "core-product",
      "files": [
        "core-product/.github/workflows/ci.yml:198",
        "core-product/.github/workflows/daily-metric.yml:94"
      ],
      "description": "AWS ECR login password (temporary auth token) is written to $GITHUB_OUTPUT without masking, then consumed by downstream jobs. The token appears in plaintext in workflow logs visible to anyone with repo read access.",
      "impact": [
        "ECR tokens grant pull/push access to all private container images in the registry",
        "Visible in plaintext in workflow logs",
        "Short-lived (~12 hours) but fully exploitable within that window",
        "Token passed across job boundaries in cleartext via GITHUB_OUTPUT"
      ],
      "context": "ECR contains production container images for the document processing pipeline."
    },
    {
      "id": "SEC-012",
      "title": "Three Docker Images Run as Root",
      "severity": "critical",
      "category": "Docker/Container Security",
      "status": "open",
      "repo": "core-product, github-workflows",
      "files": [
        "core-product/triton-server/Dockerfile (no USER directive anywhere)",
        "core-product/unstructured-api/Dockerfile.ci:40 (USER root as final directive)",
        "github-workflows/utic-build-tools/Dockerfile (no USER in runtime stage)"
      ],
      "description": "Three Docker images run as root by default: the triton ML inference server, the CI build image, and the org-wide build tools image. A container escape or application vulnerability gives the attacker root on the host.",
      "impact": [
        "Container escape gives attacker root on the host (depending on runtime config)",
        "Triton server processes customer documents — high-value target for data exfiltration",
        "CI image with root access is a prime vector for supply chain attacks",
        "Build tools image is used by all repos — root there compromises the entire CI pipeline"
      ],
      "context": "Critical for a company processing HIPAA/SOC 2 regulated data. Container isolation is a key defense layer."
    },
    {
      "id": "SEC-013",
      "title": "API Key Leaked in Error Response",
      "severity": "high",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/unstructured-api/prepline_general/api/general.py:691"],
      "description": "When an invalid API key is submitted, the server echoes the submitted key back in the error response body. If this response is logged by any intermediary (proxy, WAF, load balancer, SIEM) or shown to end users, the attempted key is exposed.",
      "impact": [
        "Submitted API keys visible in error responses and downstream logs",
        "HIPAA/SOC 2 compliance violation — credentials should never appear in logs or responses",
        "Enables key enumeration and information leakage attacks",
        "Any logging or monitoring system that captures 401 responses will store the key"
      ],
      "context": "The API key comparison also uses timing-unsafe != operator (see SEC-029)."
    },
    {
      "id": "SEC-014",
      "title": "Environment Variable Injection via User-Controlled OCR Agent",
      "severity": "high",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/unstructured-api/prepline_general/api/general.py:180-181"],
      "description": "The ocr_agent parameter comes from user form input with no validation and is set directly as os.environ['OCR_AGENT']. While table_ocr_agent has a validation gate, the ocr_agent parameter does NOT. Additionally, os.environ is process-global and not thread-safe — concurrent requests create a race condition where one request's OCR agent setting affects another request's processing.",
      "impact": [
        "Attacker can set OCR_AGENT to an arbitrary Python module path",
        "Process-global os.environ creates race conditions between concurrent requests",
        "One request's OCR agent setting can silently affect another customer's document processing",
        "Both a security vulnerability and a reliability/data-isolation concern"
      ],
      "context": "The race condition means this is both a security and reliability issue affecting multi-tenant document processing."
    },
    {
      "id": "SEC-015",
      "title": "SSRF via URL Parameters in Document Partitioning",
      "severity": "high",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "unstructured",
      "files": [
        "unstructured/unstructured/partition/auto.py:310",
        "unstructured/unstructured/partition/html/partition.py:161",
        "unstructured/unstructured/partition/md.py:97"
      ],
      "description": "partition(), partition_html(), and partition_md() accept user-controlled URLs and fetch them without any validation against internal/private IP ranges. An attacker can reach cloud metadata endpoints, internal services, or credential stores.",
      "impact": [
        "Access to cloud metadata endpoints (169.254.169.254) — exposes AWS/Azure credentials",
        "Internal service enumeration and data exfiltration",
        "md.py:97 is worst case — no timeout, no SSL verification option, completely unconstrained HTTP fetch",
        "Cloud environments (Azure VMs, AWS) expose sensitive credentials via metadata endpoints"
      ],
      "context": "Unstructured runs on Azure VMs and AWS infrastructure where metadata endpoints expose IAM credentials, subscription IDs, and other sensitive configuration."
    },
    {
      "id": "SEC-016",
      "title": "Gzip Decompression Bomb — No Size Limit",
      "severity": "high",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/unstructured-api/prepline_general/api/general.py:526-528"],
      "description": "A malicious user can upload a gzip bomb — a highly compressible file whose decompressed size is roughly 1000x its upload size (DEFLATE allows ratios up to about 1032:1, so a ~10 MB upload can expand to ~10 GB). There is no limit on the decompressed size, and SpooledTemporaryFile has no max_size set. A single request can exhaust server memory/disk, causing denial of service against all users.",
      "impact": [
        "Denial of service via a single malicious request",
        "Server memory and disk exhaustion affects ALL users on the shared infrastructure",
        ".gz files are explicitly supported by the API — trivially exploitable",
        "No rate limiting or size cap on decompression"
      ],
      "context": "Trivially exploitable since .gz is an accepted upload format. DEFLATE's maximum compression ratio is about 1032:1, so a gzip bomb of roughly 5.5 MB can decompress to about 5.5 GB."
    },
    {
      "id": "SEC-017",
      "title": "Insecure Randomness for PKI Certificate Serial Numbers",
      "severity": "high",
      "category": "PKI/Cryptography",
      "status": "open",
      "repo": "platform-libs",
      "files": ["platform-libs/libs/pki/manual_scripts/root-management/sign_with_azure.py:43"],
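      "description": "Certificate serial numbers use Python's `random` module (Mersenne Twister PRNG) which is deterministic and predictable. The CA/Browser Forum Baseline Requirements require serial numbers to contain at least 64 bits of output from a CSPRNG; RFC 5280 itself only requires serials to be unique positive integers, but unpredictable serials are the standard defense against hash-collision certificate forgery. This is a PKI root management script — compromise here undermines the entire certificate trust chain.",
      "impact": [
        "Predictable certificate serial numbers — removes the entropy that defends against hash-collision certificate forgery",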
        "Violates CA/Browser Forum Baseline Requirements",
        "An attacker who observes serial numbers can predict future ones",
        "Predictable serials have been used in real attacks (e.g., the 2008 MD5 chosen-prefix collision attack that forged a rogue CA certificate by predicting a CA's sequential serial numbers)"
      ],
      "context": "The code comment on lines 39-41 explicitly acknowledges timing attack concerns but uses the non-cryptographic random module anyway."
    },
    {
      "id": "SEC-018",
      "title": "12 CI Workflows Missing permissions: Declaration",
      "severity": "high",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "org-wide",
      "files": [
        "core-product/.github/workflows/docker-publish.yml",
        "core-product/.github/workflows/daily-metric.yml",
        "core-product/.github/workflows/ci-infra.yml",
        "core-product/.github/workflows/process-render-only-files.yml",
        "core-product/.github/workflows/triton.yml",
        "unstructured/.github/workflows/docker-publish.yml",
        "unstructured/.github/workflows/release-version-alert.yml",
        "unstructured/.github/workflows/codeflash.yml",
        "unstructured-od-models/.github/workflows/ci.yml",
        "unstructured-od-models/.github/workflows/release.yml",
        "unstructured-inference/.github/workflows/create_issue.yml",
        "unstructured.pytesseract/.github/workflows/ci.yaml"
      ],
      "description": "12 workflow files lack explicit permissions: declarations, inheriting the org/repo default (often write-all). Several are triggered by pull_request events; GitHub downgrades the token to read-only for pull requests from forks, but pull requests from branches in the same repository, and push or scheduled runs, still receive the write-scoped token.",
      "impact": [
        "GITHUB_TOKEN may get unnecessary write access to contents, packages, pull-requests, issues",
        "Violates principle of least privilege",
        "Increases blast radius of any workflow vulnerability",
        "Several are PR-triggered — code from same-repository branches runs with a write-scoped token before review (fork PRs are limited to a read-only token)"
      ],
      "context": "GitHub's security guidance recommends explicit minimal permissions on every workflow."
    },
    {
      "id": "SEC-019",
      "title": "secrets: inherit Passes All Secrets to Reusable Workflows",
      "severity": "high",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "org-wide (15+ instances)",
      "files": [
        "core-product/.github/workflows/publish.yml:195,205",
        "platform-libs/.github/workflows/ci.yml:142,155,176,185",
        "unstructured-od-models/.github/workflows/ci.yml:39,46",
        "github-workflows/.github/workflows/build.yml:158,177,192,205",
        "github-workflows/.github/workflows/auto-release-on-merge.yml:112"
      ],
      "description": "`secrets: inherit` passes ALL repository secrets to called workflows. If any reusable workflow (particularly from the shared github-workflows repo) is compromised or has a vulnerability, every secret in the calling repo is exposed.",
      "impact": [
        "Full secret exposure if any reusable workflow is compromised: AWS credentials, ACR passwords, Chainguard tokens, PyPI keys, Slack tokens, HuggingFace tokens, GitHub App private keys",
        "Violates principle of least privilege for secret access",
        "15+ instances across multiple repos",
        "Single compromised reusable workflow = full org secret breach"
      ],
      "context": "The shared github-workflows repo is the central CI infrastructure — it's the highest-value target."
    },
    {
      "id": "SEC-020",
      "title": "Dependency Confusion via extra-index-url for Private PyPI",
      "severity": "high",
      "category": "Supply Chain",
      "status": "open",
      "repo": "core-product, github-workflows",
      "files": [
        "github-workflows/.github/actions/configure-pypi/action.yml",
        "github-workflows/.github/actions/configure-pypi-oidc/action.yml",
        "core-product/.github/workflows/ci.yml (15+ instances)"
      ],
      "description": "Private packages use extra-index-url which checks BOTH public PyPI and the private Azure DevOps index, selecting the highest version. 13 private package names (utic-crypto, utic-instrumentation, utic-metrics, etc.) are unclaimed on public PyPI — an attacker could register them with higher version numbers to hijack installs.",
      "impact": [
        "Attacker registers utic-* on public PyPI with higher version number → installs malicious package instead of the real one",
        "13 private package names are unclaimed and vulnerable to squatting",
        "Only utic-public-types exists on public PyPI (legitimately published by Unstructured)",
        "Affects all CI builds and Docker builds that use extra-index-url"
      ],
      "context": "platform-libs already mitigates this with explicit = true on its private index. The pattern exists but isn't applied consistently."
    },
    {
      "id": "SEC-021",
      "title": "97% of core-product GitHub Action References Use Mutable Pins (67% Org-Wide)",
      "severity": "high",
      "category": "Supply Chain",
      "status": "open",
      "repo": "org-wide",
      "files": ["All workflow files across 8 repos (446 action references audited)"],
      "description": "Of 446 action references audited org-wide: 147 SHA-pinned (33%), 203 tag-pinned (45.5%), 96 branch-pinned (21.5%). Independent VM verification of core-product (latest main) shows only 2 of 96 action references are SHA-pinned (2.1%) — 97% use mutable tags or branches.",
      "impact": [
        "Compromised upstream action silently executes malicious code in CI with access to all secrets",
        "ludeeus/action-shellcheck@master — individual developer's repo, ~400 stars, pinned to master branch",
        "anthropics/claude-code-action@beta — mutable branch pin used across 5 repos",
        "core-product has 94 mutable action references (97%) with access to 25+ distinct secrets including AWS, Azure, PyPI, HuggingFace, Anthropic, and Slack credentials"
      ],
      "context": "github-workflows repo is best at 60.6% SHA-pinned. Renovate is configured with helpers:pinGitHubActionDigests but only in the shared repo."
    },
    {
      "id": "SEC-022",
      "title": "Self-Hosted Runners Used for PR-Triggered Workflows",
      "severity": "high",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "core-product",
      "files": [
        "core-product/.github/workflows/ci.yml:42,76,89,105,202,254,313,414",
        "core-product/.github/workflows/daily-metric.yml:22,98,254",
        "core-product/.github/workflows/docker-publish.yml:84,187,250"
      ],
      "description": "CI workflows triggered by pull_request events run on self-hosted-xlarge runners. Self-hosted runners persist state between runs (Docker images, caches, credentials on disk, environment variables). An attacker opening a PR can execute arbitrary code on the runner.",
      "impact": [
        "Arbitrary code execution on persistent runner infrastructure via a malicious PR",
        "Access to secrets and credentials left on disk by previous workflow runs",
        "Potential to install persistent backdoors on the runner",
        "Lateral movement to AWS/Azure resources since runners have cloud credentials configured"
      ],
      "context": "The self-hosted-xlarge label may correspond to GitHub-managed larger runners (lower risk) or traditional persistent runners (high risk). Needs verification."
    },
    {
      "id": "SEC-023",
      "title": "Unpinned :latest Base Images in Production Dockerfiles",
      "severity": "high",
      "category": "Docker/Container Security",
      "status": "open",
      "repo": "unstructured, unstructured-inference",
      "files": [
        "unstructured/Dockerfile:1 (cgr.dev/chainguard/wolfi-base:latest)",
        "unstructured/Dockerfile:55 (ghcr.io/astral-sh/uv:latest)",
        "unstructured-inference/Dockerfile:11 (ghcr.io/astral-sh/uv:latest)"
      ],
      "description": "Production Dockerfiles use :latest tags for base images. Builds are non-reproducible — a compromised or buggy upstream image silently enters the supply chain. Docker-compose files also use :latest for infrastructure services (prometheus, grafana, minio, kafka, etc.).",
      "impact": [
        "Non-reproducible builds — same Dockerfile produces different images at different times",
        "Compromised upstream image silently enters the supply chain",
        "Chainguard wolfi-base and uv form the foundation of production images",
        ":latest tags in docker-compose affect infrastructure services (prometheus, grafana, minio)"
      ],
      "context": "Chainguard and Astral (uv) are reputable providers, but :latest is still a mutable reference that could be compromised or accidentally broken."
    },
    {
      "id": "SEC-024",
      "title": "Unquoted Secret Expansion in Shell Commands",
      "severity": "high",
      "category": "Secrets Management",
      "status": "open",
      "repo": "core-product",
      "files": [
        "core-product/.github/workflows/ci.yml:184",
        "core-product/.github/workflows/docker-publish.yml:152"
      ],
      "description": "PRIVATE_PYPI_INDEX_URL is expanded without quotes via ${{ secrets.PRIVATE_PYPI_INDEX_URL }} directly in a run: block. GitHub Actions performs string interpolation before the shell sees it — if the secret contains shell metacharacters, it enables command injection. The extracted password may also appear unmasked in logs since it doesn't match the original secret value.",
      "impact": [
        "Potential command injection if secret value contains shell metacharacters",
        "Password extracted by sed may not match GitHub's known secret value — appears unmasked in logs",
        "Process args visible in /proc on self-hosted runners",
        "Inconsistent — the correct pattern is used in an adjacent step in the same file"
      ],
      "context": "The correct pattern (env: indirection) is already used at line 178 in the same workflow — this is an inconsistency."
    },
    {
      "id": "SEC-025",
      "title": "Private PyPI URL Written to Disk on Self-Hosted Runners",
      "severity": "high",
      "category": "Secrets Management",
      "status": "open",
      "repo": "core-product",
      "files": [
        "core-product/.github/workflows/ci.yml:131,224,277,337",
        "core-product/.github/workflows/daily-metric.yml:58,130",
        "core-product/.github/workflows/docker-publish.yml:111,224",
        "core-product/.github/workflows/triton.yml:65"
      ],
      "description": "The full PRIVATE_PYPI_INDEX_URL (with embedded Azure DevOps credentials in user:TOKEN@host format) is written to ~/.pip/unstructured.conf on the runner filesystem. On self-hosted runners (which core-product uses extensively), this file persists between jobs without cleanup.",
      "impact": [
        "Azure DevOps credentials persist on self-hosted runner disk between workflow jobs",
        "15+ instances of this pattern across core-product workflows",
        "URL contains embedded user:TOKEN@host credentials in plaintext",
        "Combined with SEC-022 (self-hosted runner risk), credentials from previous runs are accessible to subsequent PR workflows"
      ],
      "context": "This compounds with the self-hosted runner issue (SEC-022) — a PR author's code runs on a runner that has credentials from previous builds on disk."
    },
    {
      "id": "SEC-026",
      "title": "curl Piped to Shell for Poetry Install in Build Tools",
      "severity": "medium",
      "category": "Docker/Container Security",
      "status": "open",
      "repo": "github-workflows",
      "files": ["github-workflows/utic-build-tools/Dockerfile:7"],
      "description": "Poetry is installed by piping curl output directly to python3. A DNS hijack, MITM, or compromised server delivers arbitrary code execution. The -s flag silences errors, hiding download failures.",
      "impact": [
        "Arbitrary code execution from a compromised download source",
        "No checksum verification on the downloaded installer",
        "-s flag hides download failures — silent compromise",
        "Build-tools image is used across all CI — compromising it affects every repo"
      ],
      "context": "The build-tools image is the foundation of CI across the org."
    },
    {
      "id": "SEC-027",
      "title": "Python 3.9 EOL Base Image in Build Tools",
      "severity": "medium",
      "category": "Docker/Container Security",
      "status": "open",
      "repo": "github-workflows",
      "files": ["github-workflows/utic-build-tools/Dockerfile:2,24"],
      "description": "Both builder and runtime stages use python:3.9-slim. Python 3.9 reached end-of-life in October 2025 — no more security patches are being issued.",
      "impact": [
        "No security patches for Python 3.9 vulnerabilities (CVEs go unpatched)",
        "Both builder and runtime stages affected",
        "Build tools image is shared across all CI pipelines"
      ],
      "context": "Python 3.9 EOL was October 2025. Current supported versions are 3.11, 3.12, 3.13."
    },
    {
      "id": "SEC-028",
      "title": "HuggingFace Token Residual Risk in Docker Image",
      "severity": "medium",
      "category": "Secrets Management",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/unstructured-api/Dockerfile:83-90"],
      "description": "While BuildKit --mount=type=secret is used correctly for the HF token, HuggingFace libraries may cache the token to disk during model downloads. The preload script cleans ~/.cache/huggingface/token but other cache paths may be missed. The app-start.sh script looks for ~/hf_token at runtime, implying deployment paths where the token file may be baked into the image.",
      "impact": [
        "HF token may persist in image layers via library-created cache directories",
        "Internal ticket CORE-4302 acknowledges this as a known issue",
        "Marketplace images (where external users access the image) are the highest risk",
        "Token provides access to gated models and potentially private model repos"
      ],
      "context": "BuildKit secret mounting is done correctly — the risk is from HuggingFace library behavior caching tokens in unexpected locations during model downloads."
    },
    {
      "id": "SEC-029",
      "title": "Timing-Unsafe API Key Comparison",
      "severity": "medium",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "core-product",
      "files": ["core-product/unstructured-api/prepline_general/api/general.py:689"],
      "description": "API key validation uses Python's != operator which performs byte-by-byte comparison that short-circuits on the first mismatch. An attacker can statistically determine the correct API key one character at a time by measuring response time differences across many requests.",
      "impact": [
        "Timing side-channel enables API key extraction",
        "Especially exploitable on low-latency internal networks",
        "Fully automatable — requires many requests but no special access",
        "Combined with SEC-013, provides two independent key extraction vectors"
      ],
      "context": "Standard mitigation is constant-time comparison, which Python provides in the hmac module."
    },
    {
      "id": "SEC-030",
      "title": "SHA-1 Used for Subject Key Identifier in PKI Tool",
      "severity": "medium",
      "category": "PKI/Cryptography",
      "status": "open",
      "repo": "platform-libs",
      "files": ["platform-libs/libs/pki/manual_scripts/root-management/sign_with_azure.py:72-74"],
      "description": "SHA-1 is used to hash the public key for the Subject Key Identifier extension. SHA-1 is cryptographically broken for collision resistance (SHAttered attack, 2017). Modern PKI guidance from NIST and the CA/Browser Forum recommends SHA-256.",
      "impact": [
        "May fail compliance audits that require modern cryptographic algorithms",
        "SHA-1 collision attacks are practical since 2017",
        "Poor practice for a newly-built CA root management tool"
      ],
      "context": "RFC 5280 historically specified SHA-1 for SKI, but modern standards recommend SHA-256."
    },
    {
      "id": "SEC-031",
      "title": "28+ Unbounded Dependencies in ML Packages",
      "severity": "medium",
      "category": "Supply Chain",
      "status": "open",
      "repo": "unstructured-inference, unstructured-od-models",
      "files": [
        "unstructured-inference/pyproject.toml (28 unbounded)",
        "unstructured-od-models/pyproject.toml (17 unbounded)"
      ],
      "description": "unstructured-inference has 28 dependencies with >= but no upper bound (e.g., torch>=2.10.0, transformers>=4.25.1, numpy>=1.26.0). unstructured-od-models has 17. The lockfile mitigates for uv sync, but anyone who pip-installs these packages gets unbounded resolution.",
      "impact": [
        "pip install gets unbounded dependency resolution — new major versions pulled in automatically",
        "Major version bumps could introduce breaking changes or vulnerabilities",
        "45 total unbounded deps across the two ML packages",
        "Contrast: the main unstructured repo correctly uses >=X, <Y ranges"
      ],
      "context": "The lockfile mitigates for internal builds, but published packages are installed by external users without the lockfile."
    },
    {
      "id": "SEC-032",
      "title": "Socket.dev Security Check is Non-Blocking",
      "severity": "medium",
      "category": "Supply Chain",
      "status": "open",
      "repo": "github-workflows",
      "files": ["github-workflows/.github/workflows/security.yaml"],
      "description": "The Socket.dev security check uses continue-on-error: true, meaning security scan failures don't block the build. A PR introducing a known-vulnerable dependency can still merge even if Socket flags it.",
      "impact": [
        "Security scanning failures are silently ignored in CI",
        "PRs with vulnerable dependencies can merge without manual review of the scan",
        "Defeats the purpose of automated security scanning",
        "Partially mitigated by the org-level Socket ruleset (SEC-006)"
      ],
      "context": "The org-level Socket ruleset may provide a separate blocking check, but the CI-level check explicitly allows failures."
    },
    {
      "id": "SEC-033",
      "title": "Committed RSA Private Key in Test Directory",
      "severity": "medium",
      "category": "Secrets Management",
      "status": "open",
      "repo": "platform-libs",
      "files": ["platform-libs/libs/storage/blob_storage_adapters/tests/pki/key.pem"],
      "description": "A 2048-bit RSA private key is committed to the repository. While it's a self-signed certificate for localhost, committed private keys are permanently in git history and set a precedent for credential handling.",
      "impact": [
        "Private key is permanently in git history — cannot be fully removed",
        "Sets a bad precedent for credential handling across the org",
        "If test infrastructure is reachable beyond localhost, enables MITM attacks"
      ],
      "context": "The corresponding certificate is self-signed for CN=localhost, O=unstructured."
    },
    {
      "id": "SEC-034",
      "title": "version-bump.yml Grants contents: write on pull_request",
      "severity": "medium",
      "category": "CI/CD Security",
      "status": "open",
      "repo": "core-product, unstructured-inference",
      "files": [
        "core-product/.github/workflows/version-bump.yml:8-10",
        "unstructured-inference/.github/workflows/version-bump.yml:8-10"
      ],
      "description": "Workflows triggered on pull_request grant contents: write at the workflow level. The job-level if: limits execution to Renovate bot PRs, but GitHub evaluates permissions before the if: condition. For same-repo branch PRs, the full write token is available. Also calls an external workflow pinned to @main (mutable).",
      "impact": [
        "Write-scoped GITHUB_TOKEN available for all same-repo branch PRs",
        "Permissions evaluated before job if: condition — the filter doesn't prevent token generation",
        "External workflow referenced at @main — mutable pin"
      ],
      "context": "For forked PRs, GitHub auto-downgrades to read-only, limiting the blast radius."
    },
    {
      "id": "SEC-035",
      "title": "Command Injection via Pulumi Resource ID in Minikube Provider",
      "severity": "low",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "platform-libs",
      "files": ["platform-libs/libs/cloud_abstractions/minikube/utic_cloud_provider_minikube/components/kubernetes.py:61-62,127,133-134"],
      "description": "exec_shell() splits f-strings on whitespace for subprocess.Popen. A Pulumi resource ID containing spaces could cause unintended argument injection, potentially leading to unintended cluster operations like deleting all minikube profiles.",
      "impact": [
        "Unintended cluster operations via crafted resource IDs",
        "Limited to local minikube dev environments",
        "Requires crafted Pulumi state"
      ],
      "context": "Low severity — minikube is local-dev only."
    },
    {
      "id": "SEC-036",
      "title": "shell=True in Release Script with Derived Paths",
      "severity": "low",
      "category": "Code-Level Vulnerability",
      "status": "open",
      "repo": "platform-libs",
      "files": ["platform-libs/scripts/release.py:54-59"],
      "description": "The release script uses shell=True with an f-string containing a project name derived from git diff output. A maliciously-named directory under libs/ could inject shell commands. The command chain includes twine upload, making it a supply-chain risk if exploited.",
      "impact": [
        "Shell command injection via maliciously-named directory",
        "twine upload is in the command chain — could publish malicious packages",
        "Requires write access to the repo to create the directory"
      ],
      "context": "Low probability but high impact if exploited — the release pipeline is the most sensitive part of any supply chain."
    },
    {
      "id": "SEC-037",
      "title": ".gitignore Missing Critical Patterns Across All Repos",
      "severity": "low",
      "category": "Secrets Management",
      "status": "open",
      "repo": "org-wide",
      "files": ["All 8 repos' .gitignore files"],
      "description": "All repos are missing gitignore patterns for: *.pem, *.key, *.p12, pip.conf, .pypirc, .netrc, credentials.json, service-account.json. The github-workflows repo is missing .env entirely (minimal .gitignore with only 6 lines).",
      "impact": [
        "Accidental commit of cryptographic material or credential files",
        "Defense-in-depth failure — gitignore is a safety net",
        "SEC-033 (committed private key) demonstrates the real risk"
      ],
      "context": "No additional leaks found from these gaps beyond SEC-033, but the gap is a persistent risk."
    },
    {
      "id": "SEC-038",
      "title": "No HEALTHCHECK in Any Production Dockerfile",
      "severity": "low",
      "category": "Docker/Container Security",
      "status": "open",
      "repo": "org-wide",
      "files": ["All 6 Dockerfiles"],
      "description": "No Dockerfiles include HEALTHCHECK directives. Without healthchecks, container orchestrators (Kubernetes, Docker Swarm) cannot distinguish a running-but-unhealthy container from a healthy one. A hung process continues receiving traffic.",
      "impact": [
        "Hung or deadlocked containers continue receiving customer traffic",
        "Slower incident detection and recovery",
        "Kubernetes readiness probes may be configured separately, but the Dockerfile is the canonical definition"
      ],
      "context": "Some docker-compose files have healthchecks for auxiliary services, but the production Dockerfiles (API, triton) do not."
    },
    {
      "id": "SEC-039",
      "title": "No CodeQL/SAST Coverage for 7 of 8 Repos",
      "severity": "low",
      "category": "Vulnerability Management",
      "status": "open",
      "repo": "org-wide",
      "files": ["Only unstructured/.github/workflows/codeql-analysis.yml exists"],
      "description": "GitHub CodeQL static analysis is only configured for the main unstructured repo. The other 7 repos — including core-product (the API that processes sensitive documents) and platform-libs (PKI, secrets management, cloud abstractions) — have no static application security testing.",
      "impact": [
        "Code-level vulnerabilities go undetected by automated tools in 7 of 8 repos",
        "core-product (the public-facing API) and platform-libs (PKI/secrets) are the highest-risk repos without SAST",
        "Anchore/Grype covers container image scanning but not source code analysis"
      ],
      "context": "The findings in this audit (SEC-013 through SEC-017) demonstrate the kinds of vulnerabilities SAST would catch."
    }
  ],
  "summary": {
    "total_findings": 39,
    "critical": 7,
    "high": 13,
    "medium": 10,
    "low": 5,
    "info": 1,
    "positive_controls": 1,
    "fixed_by_codeflash": 0,
    "open": 33,
    "active_controls": 1,
    "by_category": {
      "Supply Chain": 7,
      "CI/CD Security": 7,
      "Docker/Container Security": 5,
      "Code-Level Vulnerability": 6,
      "Secrets Management": 7,
      "Build Reproducibility": 1,
      "Vulnerability Management": 2,
      "PKI/Cryptography": 2,
      "Build Pipeline": 1,
      "Positive Control": 1
    },
    "by_repo": {
      "org-wide": 10,
      "core-product": 12,
      "github-workflows": 7,
      "unstructured": 3,
      "platform-libs": 5,
      "unstructured-inference": 2
    }
  }
}