---
title: Web Scraping | Lightcone
description: Extract structured data from any website, even behind logins and bot detection.
---

Use Lightcone to scrape websites that block simple HTTP requests. Lightcone’s stealth browsers render JavaScript, handle bot detection, and support residential proxies — so you can extract data from sites that tools like `requests` or `fetch` can’t reach.

Always check a site’s `robots.txt` and terms of service before scraping. Respect rate limits.

## Scrape a product listing page

scrape.py

```python
from tzafon import Lightcone

client = Lightcone()

with client.computer.create(
    kind="browser",
    use_advanced_proxy=True,
) as computer:
    # Navigate to the target page
    computer.navigate("https://books.toscrape.com/catalogue/category/books/science_22/index.html")
    computer.wait(2)

    # Extract the page HTML
    html_result = computer.html()
    html_content = computer.get_html_content(html_result)

    # Take a screenshot for debugging
    result = computer.screenshot()
    print(f"Screenshot: {computer.get_screenshot_url(result)}")
    print(f"HTML length: {len(html_content)} chars")
```

scrape.ts

```typescript
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();

const computer = await client.computers.create({
  kind: "browser",
  use_advanced_proxy: true,
});
const id = computer.id!;

try {
  await client.computers.navigate(id, {
    url: "https://books.toscrape.com/catalogue/category/books/science_22/index.html",
  });

  const htmlResult = await client.computers.html(id);
  console.log("HTML retrieved");

  const screenshot = await client.computers.screenshot(id);
  console.log("Screenshot:", screenshot.result?.screenshot_url);
} finally {
  await client.computers.delete(id);
}
```

## Scrape paginated results

Handle pagination by clicking “Next” and collecting data across multiple pages:

```python
with client.computer.create(kind="browser", use_advanced_proxy=True) as computer:
    computer.navigate("https://books.toscrape.com")
    computer.wait(2)

    all_pages_html = []
    for page_num in range(1, 4):  # Scrape the first 3 pages
        html_result = computer.html()
        all_pages_html.append(computer.get_html_content(html_result))
        print(f"Scraped page {page_num}")

        # Click the "next" button
        try:
            computer.click(720, 780)  # Coordinates of the "next" button
            computer.wait(2)
        except Exception:
            print("No more pages")
            break

    print(f"Total pages scraped: {len(all_pages_html)}")
```

```typescript
const computer = await client.computers.create({
  kind: "browser",
  use_advanced_proxy: true,
});
const id = computer.id!;

try {
  await client.computers.navigate(id, { url: "https://books.toscrape.com" });
  await new Promise((r) => setTimeout(r, 2000)); // Let the page render

  for (let page = 1; page <= 3; page++) {
    const htmlResult = await client.computers.html(id);
    console.log(`Scraped page ${page}`);

    // Click the "next" button
    await client.computers.click(id, { x: 720, y: 780 });
    await new Promise((r) => setTimeout(r, 2000));
  }
} finally {
  await client.computers.delete(id);
}
```

Coordinates in these examples (like `computer.click(720, 780)`) are illustrative. Take a screenshot first to find the actual element positions on the page you’re scraping.
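Once you have the raw HTML, parse it locally. The sketch below uses Python’s standard-library `html.parser` and assumes the markup conventions of books.toscrape.com (titles in `<h3><a title="...">`, prices in `<p class="price_color">`); the `BookParser` class is illustrative, not part of the Lightcone SDK. In practice, feed it the `html_content` strings collected above:

```python
from html.parser import HTMLParser

class BookParser(HTMLParser):
    """Collect (title, price) pairs from a books.toscrape.com listing page."""

    def __init__(self):
        super().__init__()
        self.books = []
        self._in_h3 = False     # titles live in <h3><a title="...">
        self._in_price = False  # prices live in <p class="price_color">
        self._title = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "h3":
            self._in_h3 = True
        elif tag == "a" and self._in_h3 and "title" in attrs:
            self._title = attrs["title"]
        elif tag == "p" and attrs.get("class") == "price_color":
            self._in_price = True

    def handle_endtag(self, tag):
        if tag == "h3":
            self._in_h3 = False

    def handle_data(self, data):
        if self._in_price and data.strip():
            self.books.append((self._title, data.strip()))
            self._in_price = False

# Illustrative snippet in the site's listing format; in practice,
# feed the html_content collected by the scrape above.
sample = (
    '<article class="product_pod">'
    '<h3><a title="A Light in the Attic" href="#">A Light in the ...</a></h3>'
    '<p class="price_color">£51.77</p>'
    '</article>'
)
parser = BookParser()
parser.feed(sample)
print(parser.books)  # [('A Light in the Attic', '£51.77')]
```

For production scraping, a dedicated parser (or the Playwright integration’s CSS selectors) is usually more robust than hand-rolled parsing, but the standard library is enough for stable markup.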
## Scrape behind a login

Use persistent sessions to maintain login state:

```python
# First run: log in and save the session
with client.computer.create(kind="browser", persistent=True) as computer:
    computer.navigate("https://app.example.com/login")
    computer.wait(2)

    computer.click(400, 300)  # Username field
    computer.type("user@example.com")
    computer.click(400, 360)  # Password field
    computer.type("password123")
    computer.click(400, 420)  # Submit button
    computer.wait(3)

    session_id = computer.id
    print(f"Session saved: {session_id}")

# Later runs: restore the session and scrape
with client.computer.create(
    kind="browser",
    environment_id=session_id,
) as computer:
    computer.navigate("https://app.example.com/dashboard")
    computer.wait(2)
    html_result = computer.html()  # Already logged in — cookies were restored
```

```typescript
// First run: log in and save the session
const session = await client.computers.create({
  kind: "browser",
  persistent: true,
});
const id = session.id!;

await client.computers.navigate(id, { url: "https://app.example.com/login" });
await client.computers.click(id, { x: 400, y: 300 }); // Username field
await client.computers.type(id, { text: "user@example.com" });
await client.computers.click(id, { x: 400, y: 360 }); // Password field
await client.computers.type(id, { text: "password123" });
await client.computers.click(id, { x: 400, y: 420 }); // Submit button
await client.computers.delete(id);

// Later: restore the session and scrape
const restored = await client.computers.create({
  kind: "browser",
  environment_id: id,
});
await client.computers.navigate(restored.id!, {
  url: "https://app.example.com/dashboard",
});
// Already logged in
```

## Let an AI agent scrape for you

For complex scraping tasks, let the agent figure out the navigation:

```python
for event in client.agent.tasks.start_stream(
    instruction=(
        "Go to https://books.toscrape.com. "
        "Find all books in the 'Science' category. "
        "For each book, note the title and price. "
        "Report the results."
    ),
    kind="browser",
):
    print(event)
```

```typescript
const stream = await client.agent.tasks.startStream({
  instruction:
    "Go to https://books.toscrape.com. " +
    "Find all books in the 'Science' category. " +
    "For each book, note the title and price. " +
    "Report the results.",
  kind: "browser",
});

for await (const event of stream) {
  console.log(event);
}
```

## Tips for reliable scraping

- **Use `use_advanced_proxy: true`** for sites with bot detection
- **Add `computer.wait()` after navigation** to let pages fully render
- **Take screenshots** before interacting to verify the page state
- **Use persistent sessions** to avoid re-authenticating on every run
- **Use the [Playwright integration](/integrations/playwright/index.md)** when you need CSS selectors instead of coordinates

Always check a site’s `robots.txt` and terms of service before scraping. Respect rate limits and be a good citizen.

## See also

- [**Form automation**](/use-cases/form-automation/index.md) — automate form filling on scraped sites
- [**Dashboard monitoring**](/use-cases/dashboard-monitoring/index.md) — periodic scraping for monitoring
- [**Playwright integration**](/integrations/playwright/index.md) — use CSS selectors for more reliable scraping
- [**Computers**](/guides/computers/index.md) — session configuration, proxies, and persistence