# codeflash/code_to_optimize/roboflow_original.py

import base64
import json
from functools import partial
from typing import Any, Dict, List, Literal, Optional, Type, Union
import requests
from openai import OpenAI
from openai._types import NOT_GIVEN
from pydantic import ConfigDict, Field, model_validator
from inference.core.env import WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, API_BASE_URL
from inference.core.managers.base import ModelManager
from inference.core.utils.image_utils import encode_image_to_jpeg_bytes, load_image
from inference.core.workflows.core_steps.common.utils import run_in_parallel
from inference.core.workflows.core_steps.common.vlms import VLM_TASKS_METADATA
from inference.core.workflows.execution_engine.entities.base import (
Batch,
OutputDefinition,
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
FLOAT_KIND,
IMAGE_KIND,
LANGUAGE_MODEL_OUTPUT_KIND,
LIST_OF_VALUES_KIND,
SECRET_KIND,
STRING_KIND,
ImageInputField,
Selector,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
WorkflowBlock,
WorkflowBlockManifest,
)

SUPPORTED_TASK_TYPES_LIST = [
"unconstrained",
"ocr",
"structured-answering",
"classification",
"multi-label-classification",
"visual-question-answering",
"caption",
"detailed-caption",
]
SUPPORTED_TASK_TYPES = set(SUPPORTED_TASK_TYPES_LIST)
RELEVANT_TASKS_METADATA = {
k: v for k, v in VLM_TASKS_METADATA.items() if k in SUPPORTED_TASK_TYPES
}
RELEVANT_TASKS_DOCS_DESCRIPTION = "\n\n".join(
f"* **{v['name']}** (`{k}`) - {v['description']}"
for k, v in RELEVANT_TASKS_METADATA.items()
)
LONG_DESCRIPTION = f"""
Ask a question to OpenAI's GPT-4 with Vision model.
You can specify arbitrary text prompts or use one of the predefined ones; the block supports the following types of prompt:
{RELEVANT_TASKS_DOCS_DESCRIPTION}
You need to provide your OpenAI API key to use the GPT-4 with Vision model.
"""
TaskType = Literal[tuple(SUPPORTED_TASK_TYPES_LIST)]
TASKS_REQUIRING_PROMPT = {
"unconstrained",
"visual-question-answering",
}
TASKS_REQUIRING_CLASSES = {
"classification",
"multi-label-classification",
}
TASKS_REQUIRING_OUTPUT_STRUCTURE = {
"structured-answering",
}


class BlockManifest(WorkflowBlockManifest):
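    """Manifest describing inputs, outputs and UI metadata of the OpenAI GPT-4 with Vision block."""
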
model_config = ConfigDict(
json_schema_extra={
"name": "OpenAI",
"version": "v3",
"short_description": "Run OpenAI's GPT-4 with vision capabilities.",
"long_description": LONG_DESCRIPTION,
"license": "Apache-2.0",
"block_type": "model",
"search_keywords": ["LMM", "VLM", "ChatGPT", "GPT", "OpenAI"],
"is_vlm_block": True,
"task_type_property": "task_type",
"ui_manifest": {
"section": "model",
"icon": "fal fa-atom",
"blockPriority": 5,
"popular": True,
},
},
protected_namespaces=(),
)
type: Literal["roboflow_core/open_ai@v3"]
images: Selector(kind=[IMAGE_KIND]) = ImageInputField
task_type: TaskType = Field(
default="unconstrained",
description="Task type to be performed by model. Value determines required parameters and output response.",
json_schema_extra={
"values_metadata": RELEVANT_TASKS_METADATA,
"recommended_parsers": {
"structured-answering": "roboflow_core/json_parser@v1",
"classification": "roboflow_core/vlm_as_classifier@v1",
"multi-label-classification": "roboflow_core/vlm_as_classifier@v1",
},
"always_visible": True,
},
)
prompt: Optional[Union[Selector(kind=[STRING_KIND]), str]] = Field(
default=None,
description="Text prompt to the OpenAI model",
examples=["my prompt", "$inputs.prompt"],
json_schema_extra={
"relevant_for": {
"task_type": {"values": TASKS_REQUIRING_PROMPT, "required": True},
},
"multiline": True,
},
)
output_structure: Optional[Dict[str, str]] = Field(
default=None,
description="Dictionary with structure of expected JSON response",
examples=[{"my_key": "description"}, "$inputs.output_structure"],
json_schema_extra={
"relevant_for": {
"task_type": {
"values": TASKS_REQUIRING_OUTPUT_STRUCTURE,
"required": True,
},
},
},
)
classes: Optional[Union[Selector(kind=[LIST_OF_VALUES_KIND]), List[str]]] = Field(
default=None,
description="List of classes to be used",
examples=[["class-a", "class-b"], "$inputs.classes"],
json_schema_extra={
"relevant_for": {
"task_type": {
"values": TASKS_REQUIRING_CLASSES,
"required": True,
},
},
},
)
api_key: Union[Selector(kind=[STRING_KIND, SECRET_KIND]), str] = Field(
description="Your OpenAI API key",
examples=["xxx-xxx", "$inputs.openai_api_key"],
private=True,
)
model_version: Union[
Selector(kind=[STRING_KIND]), Literal["gpt-4o", "gpt-4o-mini"]
] = Field(
default="gpt-4o",
description="Model to be used",
examples=["gpt-4o", "$inputs.openai_model"],
)
image_detail: Union[
Selector(kind=[STRING_KIND]), Literal["auto", "high", "low"]
] = Field(
default="auto",
description="Indicates the image's quality, with 'high' suggesting it is of high resolution and should be processed or displayed with high fidelity.",
examples=["auto", "high", "low"],
)
max_tokens: int = Field(
default=450,
description="Maximum number of tokens the model can generate in it's response.",
)
temperature: Optional[Union[float, Selector(kind=[FLOAT_KIND])]] = Field(
default=None,
description="Temperature to sample from the model - value in range 0.0-2.0, the higher - the more "
'random / "creative" the generations are.',
ge=0.0,
le=2.0,
)
max_concurrent_requests: Optional[int] = Field(
default=None,
description="Number of concurrent requests that can be executed by block when batch of input images provided. "
"If not given - block defaults to value configured globally in Workflows Execution Engine. "
"Please restrict if you hit OpenAI limits.",
)

    @model_validator(mode="after")
def validate(self) -> "BlockManifest":
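        """Enforce the extra parameters that specific task types require."""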
if self.task_type in TASKS_REQUIRING_PROMPT and self.prompt is None:
raise ValueError(
f"`prompt` parameter required to be set for task `{self.task_type}`"
)
if self.task_type in TASKS_REQUIRING_CLASSES and self.classes is None:
raise ValueError(
f"`classes` parameter required to be set for task `{self.task_type}`"
)
if (
self.task_type in TASKS_REQUIRING_OUTPUT_STRUCTURE
and self.output_structure is None
):
raise ValueError(
f"`output_structure` parameter required to be set for task `{self.task_type}`"
)
return self

    @classmethod
def get_parameters_accepting_batches(cls) -> List[str]:
return ["images"]

    @classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [
OutputDefinition(
name="output", kind=[STRING_KIND, LANGUAGE_MODEL_OUTPUT_KIND]
),
OutputDefinition(name="classes", kind=[LIST_OF_VALUES_KIND]),
]

    @classmethod
def get_execution_engine_compatibility(cls) -> Optional[str]:
return ">=1.4.0,<2.0.0"


class OpenAIBlockV3(WorkflowBlock):
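    """Workflow block that prompts OpenAI's GPT-4 with Vision models."""
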
def __init__(
self,
model_manager: ModelManager,
api_key: Optional[str],
):
self._model_manager = model_manager
self._api_key = api_key

    @classmethod
def get_init_parameters(cls) -> List[str]:
return ["model_manager", "api_key"]

    @classmethod
def get_manifest(cls) -> Type[WorkflowBlockManifest]:
return BlockManifest

    @classmethod
def get_execution_engine_compatibility(cls) -> Optional[str]:
return ">=1.3.0,<2.0.0"

    def run(
self,
images: Batch[WorkflowImageData],
task_type: TaskType,
prompt: Optional[str],
output_structure: Optional[Dict[str, str]],
classes: Optional[List[str]],
api_key: str,
model_version: str,
image_detail: Literal["low", "high", "auto"],
max_tokens: int,
temperature: Optional[float],
max_concurrent_requests: Optional[int],
) -> BlockResult:
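        """Encode each input image, build the task-specific prompt and execute
        the OpenAI requests, returning one result per image."""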
inference_images = [i.to_inference_format() for i in images]
raw_outputs = run_gpt_4v_llm_prompting(
roboflow_api_key=self._api_key,
images=inference_images,
task_type=task_type,
prompt=prompt,
output_structure=output_structure,
classes=classes,
openai_api_key=api_key,
gpt_model_version=model_version,
gpt_image_detail=image_detail,
max_tokens=max_tokens,
temperature=temperature,
max_concurrent_requests=max_concurrent_requests,
)
return [
{"output": raw_output, "classes": classes} for raw_output in raw_outputs
]


def run_gpt_4v_llm_prompting(
images: List[Dict[str, Any]],
task_type: TaskType,
prompt: Optional[str],
output_structure: Optional[Dict[str, str]],
classes: Optional[List[str]],
roboflow_api_key: Optional[str],
openai_api_key: Optional[str],
gpt_model_version: str,
gpt_image_detail: Literal["auto", "high", "low"],
max_tokens: int,
    temperature: Optional[float],
max_concurrent_requests: Optional[int],
) -> List[str]:
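    """Build one prompt per input image for the given task type and dispatch the requests."""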
if task_type not in PROMPT_BUILDERS:
raise ValueError(f"Task type: {task_type} not supported.")
gpt4_prompts = []
for image in images:
loaded_image, _ = load_image(image)
base64_image = base64.b64encode(
encode_image_to_jpeg_bytes(loaded_image)
).decode("ascii")
generated_prompt = PROMPT_BUILDERS[task_type](
base64_image=base64_image,
prompt=prompt,
output_structure=output_structure,
classes=classes,
gpt_image_detail=gpt_image_detail,
)
gpt4_prompts.append(generated_prompt)
return execute_gpt_4v_requests(
roboflow_api_key=roboflow_api_key,
openai_api_key=openai_api_key,
gpt4_prompts=gpt4_prompts,
gpt_model_version=gpt_model_version,
max_tokens=max_tokens,
temperature=temperature,
max_concurrent_requests=max_concurrent_requests,
)


def execute_gpt_4v_requests(
    roboflow_api_key: str,
openai_api_key: str,
gpt4_prompts: List[List[dict]],
gpt_model_version: str,
max_tokens: int,
temperature: Optional[float],
max_concurrent_requests: Optional[int],
) -> List[str]:
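    """Execute the prepared prompts in parallel, capped at `max_concurrent_requests`
    (falling back to the global Workflows limit when not set)."""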
tasks = [
partial(
execute_gpt_4v_request,
roboflow_api_key=roboflow_api_key,
openai_api_key=openai_api_key,
prompt=prompt,
gpt_model_version=gpt_model_version,
max_tokens=max_tokens,
temperature=temperature,
)
for prompt in gpt4_prompts
]
max_workers = (
max_concurrent_requests
or WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS
)
return run_in_parallel(
tasks=tasks,
max_workers=max_workers,
)


def _execute_proxied_openai_request(
roboflow_api_key: str,
openai_api_key: str,
prompt: List[dict],
gpt_model_version: str,
max_tokens: int,
temperature: Optional[float],
) -> str:
"""Executes OpenAI request via Roboflow proxy."""
payload = {
"model": gpt_model_version,
"messages": prompt,
"max_tokens": max_tokens,
"openai_api_key": openai_api_key,
}
if temperature is not None:
payload["temperature"] = temperature
try:
endpoint = f"{API_BASE_URL}/apiproxy/openai?api_key={roboflow_api_key}"
response = requests.post(endpoint, json=payload)
response.raise_for_status()
response_data = response.json()
return response_data["choices"][0]["message"]["content"]
except requests.exceptions.RequestException as e:
raise RuntimeError(f"Failed to connect to Roboflow proxy: {e}") from e
except (KeyError, IndexError) as e:
raise RuntimeError(
f"Invalid response structure from Roboflow proxy: {e} - Response: {response.text}"
) from e


def _execute_openai_request(
openai_api_key: str,
prompt: List[dict],
gpt_model_version: str,
max_tokens: int,
temperature: Optional[float],
) -> str:
"""Executes OpenAI request directly."""
temp_value = temperature if temperature is not None else NOT_GIVEN
try:
client = OpenAI(api_key=openai_api_key)
response = client.chat.completions.create(
model=gpt_model_version,
messages=prompt,
max_tokens=max_tokens,
temperature=temp_value,
)
return response.choices[0].message.content
except Exception as e:
raise RuntimeError(f"OpenAI API request failed: {e}") from e


def execute_gpt_4v_request(
roboflow_api_key: str,
openai_api_key: str,
prompt: List[dict],
gpt_model_version: str,
max_tokens: int,
temperature: Optional[float],
) -> str:
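    # `rf_key:`-prefixed keys are resolved server-side by the Roboflow proxy;
    # any other value is treated as a plain OpenAI key and used directly.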
    if openai_api_key.startswith(("rf_key:account", "rf_key:user:")):
return _execute_proxied_openai_request(
roboflow_api_key=roboflow_api_key,
openai_api_key=openai_api_key,
prompt=prompt,
gpt_model_version=gpt_model_version,
max_tokens=max_tokens,
temperature=temperature,
)
else:
return _execute_openai_request(
openai_api_key=openai_api_key,
prompt=prompt,
gpt_model_version=gpt_model_version,
max_tokens=max_tokens,
temperature=temperature,
)


def prepare_unconstrained_prompt(
base64_image: str,
prompt: str,
gpt_image_detail: str,
**kwargs,
) -> List[dict]:
return [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
}
]


def prepare_classification_prompt(
base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
) -> List[dict]:
serialised_classes = ", ".join(classes)
return [
{
"role": "system",
"content": "You act as single-class classification model. You must provide reasonable predictions. "
"You are only allowed to produce JSON document in Markdown ```json [...]``` markers. "
'Expected structure of json: {"class_name": "class-name", "confidence": 0.4}. '
"`class-name` must be one of the class names defined by user. You are only allowed to return "
"single JSON document, even if there are potentially multiple classes. You are not allowed to return list.",
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"List of all classes to be recognised by model: {serialised_classes}",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]


def prepare_multi_label_classification_prompt(
base64_image: str, classes: List[str], gpt_image_detail: str, **kwargs
) -> List[dict]:
serialised_classes = ", ".join(classes)
return [
{
"role": "system",
"content": "You act as multi-label classification model. You must provide reasonable predictions. "
"You are only allowed to produce JSON document in Markdown ```json``` markers. "
'Expected structure of json: {"predicted_classes": [{"class": "class-name-1", "confidence": 0.9}, '
'{"class": "class-name-2", "confidence": 0.7}]}. '
"`class-name-X` must be one of the class names defined by user and `confidence` is a float value in range "
"0.0-1.0 that represent how sure you are that the class is present in the image. Only return class names "
"that are visible.",
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"List of all classes to be recognised by model: {serialised_classes}",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]


def prepare_vqa_prompt(
base64_image: str, prompt: str, gpt_image_detail: str, **kwargs
) -> List[dict]:
return [
{
"role": "system",
"content": "You act as Visual Question Answering model. Your task is to provide answer to question"
"submitted by user. If this is open-question - answer with few sentences, for ABCD question, "
"return only the indicator of the answer.",
},
{
"role": "user",
"content": [
{"type": "text", "text": f"Question: {prompt}"},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]


def prepare_ocr_prompt(
base64_image: str, gpt_image_detail: str, **kwargs
) -> List[dict]:
return [
{
"role": "system",
"content": "You act as OCR model. Your task is to read text from the image and return it in "
"paragraphs representing the structure of texts in the image. You should only return "
"recognised text, nothing else.",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]


def prepare_caption_prompt(
base64_image: str, gpt_image_detail: str, short_description: bool, **kwargs
) -> List[dict]:
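    """Build a captioning prompt; `short_description` selects between the
    `caption` and `detailed-caption` behaviours."""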
caption_detail_level = "Caption should be short."
if not short_description:
caption_detail_level = "Caption should be extensive."
return [
{
"role": "system",
"content": f"You act as image caption model. Your task is to provide description of the image. "
f"{caption_detail_level}",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]


def prepare_structured_answering_prompt(
base64_image: str, output_structure: Dict[str, str], gpt_image_detail: str, **kwargs
) -> List[dict]:
output_structure_serialised = json.dumps(output_structure, indent=4)
return [
{
"role": "system",
"content": "You are supposed to produce responses in JSON wrapped in Markdown markers: "
"```json\nyour-response\n```. User is to provide you dictionary with keys and values. "
"Each key must be present in your response. Values in user dictionary represent "
"descriptions for JSON fields to be generated. Provide only JSON Markdown in response.",
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"Specification of requirements regarding output fields: \n"
f"{output_structure_serialised}",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": gpt_image_detail,
},
},
],
},
]
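

# Dispatch table mapping each supported task type to its prompt-builder function;
# "caption" and "detailed-caption" share one builder toggled via `short_description`.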
PROMPT_BUILDERS = {
"unconstrained": prepare_unconstrained_prompt,
"ocr": prepare_ocr_prompt,
"visual-question-answering": prepare_vqa_prompt,
"caption": partial(prepare_caption_prompt, short_description=True),
"detailed-caption": partial(prepare_caption_prompt, short_description=False),
"classification": prepare_classification_prompt,
"multi-label-classification": prepare_multi_label_classification_prompt,
"structured-answering": prepare_structured_answering_prompt,
}
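
# A minimal usage sketch (illustration only, not part of the block API). It assumes
# a valid OpenAI key and an image reference accepted by `load_image` - e.g. a local
# file path, depending on the installed `inference` version:
#
#     outputs = run_gpt_4v_llm_prompting(
#         images=[{"type": "file", "value": "example.jpg"}],
#         task_type="ocr",
#         prompt=None,
#         output_structure=None,
#         classes=None,
#         roboflow_api_key=None,
#         openai_api_key="sk-...",
#         gpt_model_version="gpt-4o-mini",
#         gpt_image_detail="auto",
#         max_tokens=450,
#         temperature=None,
#         max_concurrent_requests=1,
#     )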