Computer-Use Agent Loop
Build a multi-turn agent loop using the Responses API where you control every step.
A computer-use agent (CUA) loop is a pattern in which you repeatedly send screenshots to a vision model; the model decides the next action, and your code executes it. This guide shows how to build a complete CUA loop using the Responses API.
Prerequisites: Understand Computers (sessions and actions) and the Responses API (request/response format).
The full loop
```python
from tzafon import Lightcone

client = Lightcone()

# 1. Create a browser session
with client.computer.create(kind="browser") as computer:
    # 2. Take an initial screenshot
    screenshot = computer.screenshot()
    screenshot_url = computer.get_screenshot_url(screenshot)

    # 3. Send the first request — use array input to include the screenshot
    response = client.responses.create(
        model="tzafon.northstar-cua-fast",
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Search Wikipedia for 'Alan Turing'"},
                    {"type": "input_image", "image_url": screenshot_url},
                ],
            },
        ],
        tools=[{
            "type": "computer_use",
            "display_width": 1280,
            "display_height": 720,
            "environment": "browser",
        }],
    )

    # 4. Loop until the model stops requesting actions
    while True:
        # Find computer_call items in the output
        computer_call = None
        for item in response.output or []:
            if item.type == "computer_call":
                computer_call = item
            elif item.type == "message":
                for block in item.content or []:
                    print(f"Agent says: {block.text}")

        if not computer_call:
            print("Agent is done.")
            break

        # 5. Execute the action
        action = computer_call.action
        print(f"Executing: {action.type}")

        if action.type in ("click", "double_click", "triple_click", "right_click"):
            computer.click(action.x, action.y)
        elif action.type == "type":
            computer.type(action.text)
        elif action.type in ("key", "keypress"):
            computer.hotkey(*action.keys)
        elif action.type == "scroll":
            computer.scroll(action.scroll_x or 0, action.scroll_y or 0,
                            action.x or 640, action.y or 400)
        elif action.type == "navigate":
            computer.navigate(action.url)
        elif action.type == "wait":
            computer.wait(2)
        elif action.type in ("terminate", "done", "answer"):
            print(f"Result: {action.result or action.text}")
            break

        computer.wait(1)

        # 6. Take a new screenshot and feed it back
        screenshot = computer.screenshot()
        screenshot_url = computer.get_screenshot_url(screenshot)

        response = client.responses.create(
            model="tzafon.northstar-cua-fast",
            previous_response_id=response.id,
            input=[{
                "type": "computer_call_output",
                "call_id": computer_call.call_id,
                "output": {"type": "input_image", "image_url": screenshot_url},
            }],
            tools=[{
                "type": "computer_use",
                "display_width": 1280,
                "display_height": 720,
                "environment": "browser",
            }],
        )
```

```typescript
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();
const computer = await client.computers.create({ kind: "browser" });
const id = computer.id!;

try {
  // Take initial screenshot
  const initialScreenshot = await client.computers.screenshot(id);
  const screenshotUrl = initialScreenshot.result?.screenshot_url as string;

  // First request — use array input to include the screenshot
  let response = await client.responses.create({
    model: "tzafon.northstar-cua-fast",
    input: [
      {
        role: "user",
        content: [
          { type: "input_text", text: "Search Wikipedia for 'Alan Turing'" },
          { type: "input_image", image_url: screenshotUrl },
        ],
      },
    ],
    tools: [{
      type: "computer_use",
      display_width: 1280,
      display_height: 720,
      environment: "browser",
    }],
  });

  // Loop until done
  while (true) {
    const computerCall = response.output?.find((item) => item.type === "computer_call");
    const message = response.output?.find((item) => item.type === "message");

    if (message) {
      for (const block of message.content ?? []) {
        console.log(`Agent says: ${block.text}`);
      }
    }

    if (!computerCall) {
      console.log("Agent is done.");
      break;
    }

    const action = computerCall.action!;
    console.log(`Executing: ${action.type}`);

    // Execute the action
    switch (action.type) {
      case "click":
      case "double_click":
      case "triple_click":
      case "right_click":
        await client.computers.click(id, { x: action.x!, y: action.y! });
        break;
      case "type":
        await client.computers.type(id, { text: action.text! });
        break;
      case "key":
      case "keypress":
        await client.computers.hotkey(id, { keys: action.keys! });
        break;
      case "scroll":
        await client.computers.scroll(id, {
          dx: action.scroll_x ?? 0,
          dy: action.scroll_y ?? 0,
          x: action.x ?? 640,
          y: action.y ?? 400,
        });
        break;
      case "navigate":
        await client.computers.navigate(id, { url: action.url! });
        break;
      case "terminate":
      case "done":
      case "answer":
        console.log(`Result: ${action.result ?? action.text}`);
        break;
    }

    // Exit the loop once the model signals completion (break above only
    // leaves the switch, not the while loop)
    if (["terminate", "done", "answer"].includes(action.type)) {
      break;
    }

    // Screenshot and feed back
    await new Promise((r) => setTimeout(r, 1000));
    const newScreenshot = await client.computers.screenshot(id);
    const newUrl = newScreenshot.result?.screenshot_url as string;

    response = await client.responses.create({
      model: "tzafon.northstar-cua-fast",
      previous_response_id: response.id!,
      input: [{
        type: "computer_call_output",
        call_id: computerCall.call_id!,
        output: { type: "input_image", image_url: newUrl },
      }],
      tools: [{
        type: "computer_use",
        display_width: 1280,
        display_height: 720,
        environment: "browser",
      }],
    });
  }
} finally {
  await client.computers.delete(id);
}
```

How it works
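The if/elif action dispatch in the Python example can also be written as a lookup table, which is easier to extend with custom actions. This is a sketch, not part of the Lightcone SDK: `build_dispatch` is a hypothetical helper, and the handler bodies mirror the calls on the `computer` session object from the example above.

```python
def build_dispatch(computer):
    """Map action types to handlers on the computer session (sketch)."""
    return {
        "click": lambda a: computer.click(a.x, a.y),
        "type": lambda a: computer.type(a.text),
        "keypress": lambda a: computer.hotkey(*a.keys),
        "scroll": lambda a: computer.scroll(a.scroll_x or 0, a.scroll_y or 0,
                                            a.x or 640, a.y or 400),
        "navigate": lambda a: computer.navigate(a.url),
        "wait": lambda a: computer.wait(2),
    }
```

Inside the loop this replaces the chain with a single lookup, e.g. `build_dispatch(computer)[action.type](action)`, with a `KeyError` (or a fallback handler) for unknown action types.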
```text
┌─────────────┐   instruction + screenshot    ┌─────────────┐
│             │ ────────────────────────────> │             │
│  Your Code  │                               │    Model    │
│             │ <──────────────────────────── │             │
└─────────────┘   computer_call (action)      └─────────────┘
      │                                              ^
      │ execute action                               │
      v                                              │
┌─────────────┐                                      │
│  Computer   │      screenshot of new state         │
│  Session    │ ─────────────────────────────────────┘
└─────────────┘      (as computer_call_output)
```

Each iteration:
- The model looks at the screenshot and decides the next action
- You execute the action on the computer session
- You take a screenshot of the result
- You send it back as `computer_call_output` with `previous_response_id`
- Repeat until the model sends a `message` or a `done` action
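The steps above can be condensed into a small driver function. This is a minimal sketch with injected callables, not SDK code: `get_action` stands in for the Responses API call, `execute` for the action dispatch, and `observe` for the screenshot step.

```python
def run_cua_loop(get_action, execute, observe, screenshot, max_steps=50):
    """Drive the iteration above: ask the model, act, observe, repeat."""
    for _ in range(max_steps):
        action = get_action(screenshot)   # model looks at the current screenshot
        if action is None:                # no computer_call in the output: done
            return "done"
        execute(action)                   # run the action on the computer session
        screenshot = observe()            # screenshot of the new state, fed back
    return "max steps reached"
```

The `max_steps` cap is worth keeping in any real loop so a confused model cannot run forever.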
When to use this vs. Agent Tasks
| | CUA Loop (Responses API) | Agent Tasks |
|---|---|---|
| Control | You control every step | Fully autonomous |
| Customization | Add custom logic between steps | Limited to instructions |
| Observability | Full visibility into every action | Stream events |
| Complexity | More code to write | One API call |
| Best for | Custom workflows, hybrid agents | Simple end-to-end tasks |
See also
- Responses API — reference for creating responses, action types, and multi-turn chaining
- Run an agent — simpler alternative using Agent Tasks
- Automate a browser — direct browser control without an AI model