# How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

def parse_click_coords(action_str):
    """
    Extract normalised (x, y) coordinates from a click action string.

    e.g., 'click(0.45, 0.32)' -> (0.45, 0.32)

    The click call is searched for anywhere in ``action_str``, so text
    before or after it is tolerated.

    Args:
        action_str: Action text that may contain a ``click(x, y)`` call.

    Returns:
        A ``(x, y)`` tuple of floats, or ``None`` if no well-formed click
        call is present.
    """
    # A strict decimal pattern (rather than a loose "digits and dots" class)
    # guarantees float() below cannot raise on inputs like "1.2.3" or ".".
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    match = re.search(
        r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))

def parse_action_details(action_str):
    """
    Parse a MolmoWeb action string into a structured dict.

    Leading/trailing whitespace is stripped first. Each pattern is anchored
    at the start of the string only, so trailing text after the closing
    parenthesis is ignored.

    Returns one of:
        {"type": "click", "x": 0.45, "y": 0.32}
        {"type": "goto", "url": "https://…"}
        {"type": "type", "text": "query text"}
        {"type": "scroll", "direction": "down"}
        {"type": "press", "key": "Enter"}
        {"type": "send_msg", "message": "The answer is …"}
        {"type": "new_tab" | "go_back" | "switch_tab"}  (plus "tab": int
            when a numeric argument is given)
        {"type": "unknown", "raw": "…"}
    """
    action_str = action_str.strip()

    # Strict decimal pattern so float() cannot raise on input like "1.2.3";
    # such malformed clicks fall through to the "unknown" result instead.
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    m = re.match(
        r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if m:
        return {"type": "click", "x": float(m.group(1)), "y": float(m.group(2))}

    # Actions taking one quoted, non-empty string argument:
    # (action name, result key, regex flags).
    # send_msg uses DOTALL because a message may span several lines.
    quoted_arg_actions = (
        ("goto", "url", 0),
        ("type", "text", 0),
        ("press", "key", 0),
        ("send_msg", "message", re.DOTALL),
    )
    for name, field, flags in quoted_arg_actions:
        m = re.match(name + r'\(\s*["\'](.+?)["\']\s*\)', action_str, flags)
        if m:
            return {"type": name, field: m.group(1)}

    # The scroll direction may be quoted or bare.
    m = re.match(r'scroll\(\s*["\']?(up|down)["\']?\s*\)', action_str)
    if m:
        return {"type": "scroll", "direction": m.group(1)}

    # Tab/navigation actions accept an optional integer argument.
    m = re.match(r"(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)", action_str)
    if m:
        result = {"type": m.group(1)}
        if m.group(2):
            result["tab"] = int(m.group(2))
        return result

    return {"type": "unknown", "raw": action_str}

def visualise_click(image, action_str, title="MolmoWeb Prediction"):
    """
    Draw the predicted click location on the screenshot and display it.

    Coordinates are normalised (0-1); we convert to pixel space using
    ``image.size``. When ``action_str`` contains no click, the raw action
    text is shown in a banner near the bottom of the image instead.

    Args:
        image: Screenshot to display; must expose ``.size`` as
            ``(width, height)`` and be accepted by ``ax.imshow``.
        action_str: Action string, e.g. 'click(0.45, 0.32)'.
        title: Figure title.
    """
    coords = parse_click_coords(action_str)

    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.imshow(image)
    ax.set_title(title, fontsize=14)

    if coords is not None:
        x_norm, y_norm = coords
        w, h = image.size
        x_px, y_px = x_norm * w, y_norm * h

        # Open red circle plus a crosshair marks the exact click target.
        circle = patches.Circle(
            (x_px, y_px),
            radius=18,
            linewidth=3,
            edgecolor="red",
            facecolor="none",
        )
        ax.add_patch(circle)
        ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

        # Label is offset from the target so it does not cover the marker;
        # the arrow points back at the click position.
        ax.annotate(
            f"click({x_norm:.3f}, {y_norm:.3f})",
            (x_px, y_px),
            xytext=(x_px + 25, y_px - 25),
            fontsize=11,
            color="white",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.8),
            arrowprops=dict(arrowstyle="->", color="red", lw=2),
        )
    else:
        # Non-click action: show it as text in axes-relative coordinates.
        ax.text(
            0.5,
            0.02,
            f"Action: {action_str}",
            transform=ax.transAxes,
            fontsize=12,
            ha="center",
            color="white",
            bbox=dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8),
        )

    ax.axis("off")
    plt.tight_layout()
    plt.show()

def download_image(url, size=(1280, 720)):
    """
    Download an image from a URL and resize to browser viewport dimensions.

    Args:
        url: Address of the image to fetch.
        size: Target ``(width, height)`` in pixels.

    Returns:
        The downloaded image converted to RGB and resized to ``size``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=15)
    # Fail loudly on HTTP errors rather than handing an error page to PIL,
    # which would otherwise surface as a confusing image-decoding failure.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img.resize(size, Image.LANCZOS)

def create_synthetic_webpage(title="Example Page", elements=None, size=(1280, 720)):
    """
    Create a synthetic webpage screenshot for testing.

    The image has a white background, a grey browser-style top bar with an
    address field and three round window buttons, the page title, and any
    requested elements.

    Args:
        title: Text drawn as the page heading.
        elements: Optional list of dicts of the form
            {"type": "button"|"input"|"text"|"link",
             "text": str, "pos": (x, y)}.
            Entries with any other "type" are ignored.
        size: Canvas ``(width, height)`` in pixels; defaults to the
            1280x720 viewport used elsewhere in this file.

    Returns:
        The rendered RGB image.
    """
    width, _height = size
    img = Image.new("RGB", size, color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Browser chrome: top bar spanning the full width, then the address field.
    draw.rectangle([0, 0, width, 50], fill=(240, 240, 240))
    draw.rectangle([180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white")
    draw.text((200, 16), "https://www.example.com", fill=(100, 100, 100))

    # Three round window-control buttons on the left of the bar.
    for cx in (30, 60, 90):
        draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

    draw.text((50, 70), title, fill="black")

    for el in elements or ():
        x, y = el["pos"]
        kind = el["type"]
        if kind == "button":
            draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
            draw.text((x + 10, y + 8), el["text"], fill="white")
        elif kind == "input":
            draw.rectangle([x, y, x + 300, y + 35], outline=(180, 180, 180), width=2)
            draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
        elif kind == "text":
            draw.text((x, y), el["text"], fill="black")
        elif kind == "link":
            draw.text((x, y), el["text"], fill=(66, 133, 244))

    return img

print("Helper functions defined successfully.")

# ---------------------------------------------------------------------------
# Section 5: single-step inference from a cold start.
# The model is shown a blank white screenshot at about:blank together with the
# task description, and its raw output is split into a thought and an action,
# which is then parsed into a structured dict.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70)
print("SECTION 5: Single-step inference – blank page (cold start)")
print("=" * 70)
print("The agent starts at about:blank and must decide its first action.\n")

blank_image = Image.new("RGB", (1280, 720), color="white")

task = "Go to arxiv.org and find the latest paper about Molmo from Ai2"

prompt = build_prompt(
    task_description=task,
    page_url="about:blank",
    page_index=0,
)

print(f"Task: {task}")
print("Screenshot: blank white image (about:blank)")
print("Running inference…\n")

raw_output = run_inference(prompt, blank_image)

print(f"Raw model output:\n{raw_output}\n")

parsed = parse_thought_and_action(raw_output)
print(f"Thought: {parsed['thought']}")
print(f"Action: {parsed['action']}")

action_details = parse_action_details(parsed["action"])
print(f"Parsed: {action_details}")

# Source link
#
# How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction
#
# You may also like