Web Scraping
Extract structured data from any website, even behind logins and bot detection.
Use Lightcone to scrape websites that block simple HTTP requests. Lightcone’s stealth browsers render JavaScript, handle bot detection, and support residential proxies — so you can extract data from sites that tools like `requests` or `fetch` can’t reach.
Scrape a product listing page
Python:

```python
from tzafon import Lightcone

client = Lightcone()

with client.computer.create(
    kind="browser",
    use_advanced_proxy=True,
) as computer:
    # Navigate to the target page
    computer.navigate(
        "https://books.toscrape.com/catalogue/category/books/science_22/index.html"
    )
    computer.wait(2)

    # Extract the page HTML
    html_result = computer.html()
    html_content = computer.get_html_content(html_result)

    # Take a screenshot for debugging
    result = computer.screenshot()
    print(f"Screenshot: {computer.get_screenshot_url(result)}")
    print(f"HTML length: {len(html_content)} chars")
```

TypeScript:

```typescript
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();
const computer = await client.computers.create({
  kind: "browser",
  use_advanced_proxy: true,
});
const id = computer.id!;

try {
  await client.computers.navigate(id, {
    url: "https://books.toscrape.com/catalogue/category/books/science_22/index.html",
  });

  const htmlResult = await client.computers.html(id);
  console.log("HTML retrieved");

  const screenshot = await client.computers.screenshot(id);
  console.log("Screenshot:", screenshot.result?.screenshot_url);
} finally {
  await client.computers.delete(id);
}
```

Scrape paginated results
Handle pagination by clicking “Next” and collecting data across multiple pages:
Python:

```python
with client.computer.create(kind="browser", use_advanced_proxy=True) as computer:
    computer.navigate("https://books.toscrape.com")
    computer.wait(2)

    all_pages_html = []

    for page_num in range(1, 4):  # Scrape the first 3 pages
        html_result = computer.html()
        all_pages_html.append(computer.get_html_content(html_result))
        print(f"Scraped page {page_num}")

        # Click the "next" button
        try:
            computer.click(720, 780)  # Coordinates of the "next" button
            computer.wait(2)
        except Exception:
            print("No more pages")
            break

    print(f"Total pages scraped: {len(all_pages_html)}")
```

TypeScript:

```typescript
const computer = await client.computers.create({
  kind: "browser",
  use_advanced_proxy: true,
});
const id = computer.id!;
const allPagesHtml: unknown[] = [];

try {
  await client.computers.navigate(id, { url: "https://books.toscrape.com" });

  for (let page = 1; page <= 3; page++) {
    const htmlResult = await client.computers.html(id);
    allPagesHtml.push(htmlResult); // collect each page's HTML result
    console.log(`Scraped page ${page}`);

    // Click next
    await client.computers.click(id, { x: 720, y: 780 });
    await new Promise((r) => setTimeout(r, 2000));
  }
} finally {
  await client.computers.delete(id);
}
```

Coordinates in these examples (like `computer.click(720, 780)`) are illustrative. Take a screenshot first to find the actual element positions on the page you’re scraping.
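Once the HTML is collected, turning it into structured records is ordinary Python. Below is a minimal sketch using only the standard library’s `html.parser` to pull titles and prices out of markup shaped like books.toscrape.com’s listings. The `title` attribute and `price_color` class here are assumptions about that site’s markup, not part of the Lightcone API; in a real run, you would feed in the string returned by `computer.get_html_content(...)`.

```python
from html.parser import HTMLParser

class BookParser(HTMLParser):
    """Collect (title, price) records from books.toscrape.com-style markup."""

    def __init__(self):
        super().__init__()
        self.books = []
        self._in_price = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # Each listing's title appears as an <a title="..."> link
        if tag == "a" and "title" in attrs:
            self.books.append({"title": attrs["title"], "price": None})
        elif tag == "p" and attrs.get("class") == "price_color":
            self._in_price = True

    def handle_data(self, data):
        # Attach the price text to the most recently seen book
        if self._in_price and self.books:
            self.books[-1]["price"] = data.strip()
            self._in_price = False

# Stand-in for html_content from the scrape above
sample = """
<article class="product_pod">
  <h3><a href="a.html" title="A Short History of Nearly Everything">A Short...</a></h3>
  <p class="price_color">£52.23</p>
</article>
"""

parser = BookParser()
parser.feed(sample)
print(parser.books)
# → [{'title': 'A Short History of Nearly Everything', 'price': '£52.23'}]
```

A streaming parser like this tolerates the slightly messy HTML real pages return; for heavier extraction you may prefer a library such as BeautifulSoup, or the Playwright integration mentioned below.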
Scrape behind a login
Use persistent sessions to maintain login state:
Python:

```python
# First run: log in and save the session
with client.computer.create(kind="browser", persistent=True) as computer:
    computer.navigate("https://app.example.com/login")
    computer.wait(2)
    computer.click(400, 300)  # Username field
    computer.type("user@example.com")
    computer.click(400, 360)  # Password field
    computer.type("password123")
    computer.click(400, 420)  # Submit button
    computer.wait(3)
    session_id = computer.id
    print(f"Session saved: {session_id}")

# Later runs: restore the session and scrape
with client.computer.create(
    kind="browser",
    environment_id=session_id,
) as computer:
    computer.navigate("https://app.example.com/dashboard")
    computer.wait(2)
    html_result = computer.html()
    # Already logged in — cookies were restored
```

TypeScript:

```typescript
// First run: log in and save
const session = await client.computers.create({
  kind: "browser",
  persistent: true,
});
const id = session.id!;

await client.computers.navigate(id, { url: "https://app.example.com/login" });
await client.computers.click(id, { x: 400, y: 300 });
await client.computers.type(id, { text: "user@example.com" });
await client.computers.click(id, { x: 400, y: 360 });
await client.computers.type(id, { text: "password123" });
await client.computers.click(id, { x: 400, y: 420 });

await client.computers.delete(id);

// Later: restore and scrape
const restored = await client.computers.create({
  kind: "browser",
  environment_id: id,
});
await client.computers.navigate(restored.id!, {
  url: "https://app.example.com/dashboard",
});
// Already logged in
```

Let an AI agent scrape for you
For complex scraping tasks, let the agent figure out the navigation:
Python:

```python
for event in client.agent.tasks.start_stream(
    instruction=(
        "Go to https://books.toscrape.com. "
        "Find all books in the 'Science' category. "
        "For each book, note the title and price. "
        "Report the results."
    ),
    kind="browser",
):
    print(event)
```

TypeScript:

```typescript
const stream = await client.agent.tasks.startStream({
  instruction:
    "Go to https://books.toscrape.com. " +
    "Find all books in the 'Science' category. " +
    "For each book, note the title and price. " +
    "Report the results.",
  kind: "browser",
});

for await (const event of stream) {
  console.log(event);
}
```

Tips for reliable scraping
Section titled “Tips for reliable scraping”- Use
use_advanced_proxy: truefor sites with bot detection - Add
computer.wait()after navigation to let pages fully render - Take screenshots before interacting to verify the page state
- Use persistent sessions to avoid re-authenticating on every run
- Use the Playwright integration when you need CSS selectors instead of coordinates
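Several of these tips (waiting, verifying, recovering from flaky pages) compose into one pattern: wrap each scraping step in a retry loop with a delay between attempts. The helper below is a sketch, not part of the Lightcone SDK; `with_retries` is a hypothetical name, and in practice you would pass it a closure around calls like `computer.navigate` followed by `computer.html`.

```python
import time

def with_retries(step, attempts=3, delay=2.0, sleep=time.sleep):
    """Run a zero-argument `step` up to `attempts` times.

    Sleeps `delay` seconds between failed tries and re-raises the
    last error if every attempt fails.
    """
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            return step()
        except Exception as err:
            last_error = err
            if attempt < attempts:
                sleep(delay)
    raise last_error

# Demo with a step that only succeeds on the third try,
# standing in for a navigate-then-extract closure:
calls = {"count": 0}

def flaky_step():
    calls["count"] += 1
    if calls["count"] < 3:
        raise RuntimeError("page not ready")
    return "<html>ok</html>"

print(with_retries(flaky_step, delay=0))  # → <html>ok</html>
```

In a real run you would keep `delay` at a couple of seconds, mirroring the `computer.wait(2)` calls in the examples above.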
See also
- Form automation — automate form filling on scraped sites
- Dashboard monitoring — periodic scraping for monitoring
- Playwright integration — use CSS selectors for more reliable scraping
- Computers — session configuration, proxies, and persistence