Skip to content

Commit cf1912d

Browse files
committed
Some transformer bench ideas
1 parent 26c6281 commit cf1912d

File tree

2 files changed

+173
-13
lines changed

2 files changed

+173
-13
lines changed

olmocr/bench/runners/run_transformers.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,21 @@
55

66
import torch
77
from PIL import Image
8-
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
8+
from transformers import (
9+
AutoProcessor,
10+
Qwen2VLForConditionalGeneration,
11+
Qwen2_5_VLForConditionalGeneration,
12+
)
913

1014
from olmocr.data.renderpdf import render_pdf_to_base64png
1115
from olmocr.prompts.anchor import get_anchor_text
1216
from olmocr.prompts.prompts import (
1317
PageResponse,
1418
build_finetuning_prompt,
1519
build_openai_silver_data_prompt,
20+
build_no_anchoring_yaml_prompt,
1621
)
22+
from olmocr.train.dataloader import FrontMatterParser
1723

1824
_cached_model = None
1925
_cached_processor = None
@@ -22,11 +28,11 @@
2228
def run_transformers(
2329
pdf_path: str,
2430
page_num: int = 1,
25-
model: str = "allenai/olmOCR-7B-0225-preview",
31+
model_name: str = "allenai/olmOCR-7B-0725-FP8",
2632
temperature: float = 0.1,
2733
target_longest_image_dim: int = 1024,
28-
prompt_template: Literal["full", "finetune"] = "finetune",
29-
response_template: Literal["plain", "json"] = "json",
34+
prompt_template: Literal["full", "finetune", "yaml"] = "yaml",
35+
response_template: Literal["plain", "json", "yaml"] = "yaml",
3036
) -> str:
3137
"""
3238
Convert page of a PDF file to markdown by calling a request
@@ -45,8 +51,9 @@ def run_transformers(
4551
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
4652

4753
if _cached_model is None:
48-
model = Qwen2VLForConditionalGeneration.from_pretrained(model, torch_dtype=torch.bfloat16).eval()
49-
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
54+
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
55+
processor = AutoProcessor.from_pretrained(model_name)
56+
5057
model = model.to(device)
5158

5259
_cached_model = model
@@ -57,19 +64,22 @@ def run_transformers(
5764

5865
# Convert the requested page (page_num) of the PDF to a base64-encoded PNG image.
5966
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
60-
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
61-
62-
if prompt_template == "full":
63-
prompt = build_openai_silver_data_prompt(anchor_text)
67+
68+
if prompt_template == "yaml":
69+
prompt = build_no_anchoring_yaml_prompt()
6470
else:
65-
prompt = build_finetuning_prompt(anchor_text)
71+
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
72+
if prompt_template == "full":
73+
prompt = build_openai_silver_data_prompt(anchor_text)
74+
else:
75+
prompt = build_finetuning_prompt(anchor_text)
6676

6777
messages = [
6878
{
6979
"role": "user",
7080
"content": [
71-
{"type": "text", "text": prompt},
7281
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
82+
{"type": "text", "text": prompt},
7383
],
7484
}
7585
]
@@ -107,6 +117,12 @@ def run_transformers(
107117
if response_template == "json":
108118
page_data = json.loads(text_output)
109119
page_response = PageResponse(**page_data)
110-
return page_response.natural_text
120+
return page_response.natural_text if page_response.natural_text else ""
121+
elif response_template == "yaml":
122+
# Parse YAML front matter and extract natural text
123+
parser = FrontMatterParser(front_matter_class=PageResponse)
124+
front_matter, text = parser._extract_front_matter_and_text(text_output)
125+
page_response = parser._parse_front_matter(front_matter, text)
126+
return page_response.natural_text if page_response.natural_text else ""
111127
elif response_template == "plain":
112128
return text_output

scripts/run_transformers_benchmark.sh

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/bin/bash

# Runs the transformers benchmark in olmocr-bench.
#
# Steps performed by this script:
#   1. Refuse to run when the git working tree has uncommitted changes.
#   2. Build a Docker image tagged with the olmocr version and git hash.
#   3. Upload that image to Beaker (best effort; an existing image is reused).
#   4. Generate a small Python driver that submits one Beaker experiment which
#      clones olmOCR-bench, runs the transformers converter and then the
#      benchmark scorer.
#
# Requires on PATH: git, docker, beaker, jq, and a Python with `olmocr` and
# `beaker-py` installed.

# -e: abort on the first failing command; -u: unset variables are errors;
# pipefail: a failure anywhere in a pipeline (e.g. `beaker ... | jq`) aborts.
set -euo pipefail

# Check for uncommitted changes. The image tag below is derived from the HEAD
# commit hash, so presumably a dirty tree would make the tag misleading.
if ! git diff-index --quiet HEAD --; then
    echo "Error: There are uncommitted changes in the repository."
    echo "Please commit or stash your changes before running the benchmark."
    echo ""
    echo "Uncommitted changes:"
    git status --short
    exit 1
fi

# Use conda environment Python if available, otherwise use system Python.
# ${CONDA_PREFIX:-} keeps `set -u` from aborting when no conda env is active.
if [ -n "${CONDA_PREFIX:-}" ]; then
    PYTHON="$CONDA_PREFIX/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"

# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
# `jq -r` prints the literal string "null" when the field is missing.
if [ -z "$BEAKER_USER" ] || [ "$BEAKER_USER" = "null" ]; then
    echo "Error: Could not determine Beaker username from 'beaker account whoami'."
    exit 1
fi
echo "Beaker user: $BEAKER_USER"

# Push image to beaker. This is deliberately best effort: a failure is assumed
# to mean the image was uploaded by an earlier run. stderr is left visible so
# that other failures (auth, network) are not silently mistaken for that case.
# NOTE(review): the image is created in workspace ai2/oe-data-pdf while the
# experiment below is created in ai2/olmocr -- confirm this is intentional.
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG"; then
    echo "Warning: Could not create Beaker image $IMAGE_TAG (it most likely already exists). Using existing image."
fi

# The Python driver is written into a private temporary directory that is
# removed on every exit path, including early aborts triggered by `set -e`.
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
EXPERIMENT_SCRIPT="$WORK_DIR/run_benchmark_experiment.py"

# Create Python script to run beaker experiment. The heredoc delimiter is
# quoted so the shell performs no expansion inside the Python source.
cat << 'EOF' > "$EXPERIMENT_SCRIPT"
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

# Workspace that holds the secret lookup and the submitted experiment.
WORKSPACE = "ai2/olmocr"

# Get image tag, beaker user, git branch and git hash from the command line.
if len(sys.argv) < 5:
    sys.exit(f"usage: {sys.argv[0]} IMAGE_TAG BEAKER_USER GIT_BRANCH GIT_HASH")
image_tag, beaker_user, git_branch, git_hash = sys.argv[1:5]

# Initialize Beaker client
b = Beaker.from_env(default_workspace=WORKSPACE)

# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
    # Try to get the secret to see if it exists
    b.secret.get(aws_creds_secret, workspace=WORKSPACE)
except Exception:
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit propagate.
    # NOTE(review): narrow this to beaker-py's secret-not-found exception once
    # the installed beaker-py version is confirmed.
    has_aws_creds = False
    print(f"AWS credentials secret not found: {aws_creds_secret}")
else:
    has_aws_creds = True
    print(f"Found AWS credentials secret: {aws_creds_secret}")

# Shell commands executed inside the benchmark task, joined with " && " below
# so that the first failing step stops the job.
commands = []
if has_aws_creds:
    # Materialize the secret (injected as an env var below) as a credentials file.
    commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials',
    ])
commands.extend([
    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
    "cd olmOCR-bench && git lfs pull && cd ..",
    "python -m olmocr.bench.convert transformers:target_longest_image_dim=1288:prompt_template=yaml:response_template=yaml --dir ./olmOCR-bench/bench_data",
    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data",
])

# Build task spec with optional env vars
task_spec_args = {
    "name": "transformers-benchmark",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(commands),
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret),
    ]

# Create the experiment spec
experiment_spec = ExperimentSpec(
    description=f"Transformers Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**task_spec_args)],
)

# Create the experiment
experiment = b.experiment.create(spec=experiment_spec, workspace=WORKSPACE)
print(f"Created benchmark experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
print("-------")
print("")
EOF

# Run the Python script to create the experiment. Arguments are quoted so
# each value reaches the script as exactly one argv entry.
echo "Creating Beaker experiments..."
"$PYTHON" "$EXPERIMENT_SCRIPT" "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH"

# The temporary driver is removed by the EXIT trap installed above.
echo "Benchmark experiments submitted successfully!"

0 commit comments

Comments
 (0)