Skip to content
Dashboard

Tutorial: Scrape Behind a Login

Build a scraper that logs into a website, saves the session, and extracts data from authenticated pages.

In this tutorial, you’ll build a scraper that logs into a website, persists the session so you don’t have to log in again, and extracts data from pages that require authentication. Along the way, you’ll learn how to interact with forms, use persistent sessions, combine screenshots with HTML extraction, and build reusable scraping scripts.

Prerequisites: Complete the Quickstart and have TZAFON_API_KEY set in your environment.

Time: About 15 minutes.

What you'll build — a script that:

  1. Opens a browser and navigates to a login page
  2. Fills in credentials and submits the form
  3. Verifies the login succeeded with a screenshot
  4. Saves the session for future runs
  5. Restores the saved session and scrapes authenticated content

We’ll use Quotes to Scrape, a safe practice site with a login page.

scrape_login.py
from tzafon import Lightcone

client = Lightcone()

# Open a remote browser, load the login page, and capture a screenshot
# so we can locate the form fields before interacting with them.
# The `with` block ensures the browser is cleaned up when we're done.
with client.computer.create(kind="browser") as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)  # give the page time to render before capturing
    result = computer.screenshot()
    print(f"Login page: {computer.get_screenshot_url(result)}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();

// Open a remote browser, load the login page, and capture a screenshot
// so we can locate the form fields before interacting with them.
const computer = await client.computers.create({ kind: "browser" });
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000)); // give the page time to render
  const result = await client.computers.screenshot(id);
  console.log("Login page:", result.result?.screenshot_url);
} finally {
  // Always release the remote browser, even if a step above threw.
  await client.computers.delete(id);
}

Open the screenshot URL to see the login form. Note the positions of the username field, password field, and submit button.

Now click each field, type the credentials, and submit. Quotes to Scrape accepts any username with the password “password”.

scrape_login.py
from tzafon import Lightcone

client = Lightcone()

# Log in by clicking each form field and typing into it, then verify
# the result both visually (screenshot) and programmatically (HTML check).
with client.computer.create(kind="browser") as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)  # let the page render before clicking

    # Fill in the username
    # NOTE: coordinates come from inspecting the step-1 screenshot;
    # they are specific to this page layout.
    computer.click(580, 280)  # Username field
    computer.type("scraper")

    # Fill in the password (the site accepts any username + "password")
    computer.click(580, 330)  # Password field
    computer.type("password")

    # Submit
    computer.click(580, 390)  # Login button
    computer.wait(2)  # wait for the post-login redirect

    # Verify login succeeded with a screenshot for human inspection...
    result = computer.screenshot()
    print(f"After login: {computer.get_screenshot_url(result)}")

    # ...and by checking the page HTML for the "Logout" link that only
    # appears for authenticated users.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" in content:
        print("Login successful!")
    else:
        print("Login may have failed — check the screenshot")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();

// Log in by clicking each form field and typing into it, then verify
// the result both visually (screenshot) and programmatically (HTML check).
const computer = await client.computers.create({ kind: "browser" });
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000)); // let the page render

  // Fill in the username
  // NOTE: coordinates come from inspecting the step-1 screenshot.
  await client.computers.click(id, { x: 580, y: 280 }); // username field
  await client.computers.type(id, { text: "scraper" });

  // Fill in the password (the site accepts any username + "password")
  await client.computers.click(id, { x: 580, y: 330 }); // password field
  await client.computers.type(id, { text: "password" });

  // Submit
  await client.computers.click(id, { x: 580, y: 390 }); // login button
  await new Promise((r) => setTimeout(r, 2000)); // wait for the redirect

  // Verify with a screenshot for human inspection...
  const result = await client.computers.screenshot(id);
  console.log("After login:", result.result?.screenshot_url);

  // ...and by checking the HTML for the "Logout" link that only appears
  // for authenticated users. Default to "" so a missing payload reports
  // a failed login instead of crashing on `.includes`.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (content.includes("Logout")) {
    console.log("Login successful!");
  } else {
    console.log("Login may have failed — check the screenshot");
  }
} finally {
  await client.computers.delete(id);
}

Creating a new browser and logging in every time is slow. Use a persistent session to save cookies and skip the login on future runs.

scrape_login.py
from pathlib import Path

from tzafon import Lightcone

client = Lightcone()

# Log in once with a persistent browser so cookies survive this run and
# future scripts can skip the login entirely.
with client.computer.create(kind="browser", persistent=True) as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)

    # Same form interaction as before (coordinates from the screenshot).
    computer.click(580, 280)  # username field
    computer.type("scraper")
    computer.click(580, 330)  # password field
    computer.type("password")
    computer.click(580, 390)  # login button
    computer.wait(2)

    # Verify login BEFORE saving — a saved session that never
    # authenticated is useless on restore.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" not in content:
        raise Exception("Login failed")

    session_id = computer.id
    # Save the session ID so the scraping script can restore it later
    # (swap this for a database write in a real pipeline).
    Path("session.txt").write_text(session_id)
    print(f"Session saved: {session_id}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";
import { writeFileSync } from "fs";

const client = new Lightcone();

// Log in once with a persistent browser so cookies survive this run and
// future scripts can skip the login entirely.
const computer = await client.computers.create({
  kind: "browser",
  persistent: true,
});
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000));

  // Same form interaction as before (coordinates from the screenshot).
  await client.computers.click(id, { x: 580, y: 280 }); // username field
  await client.computers.type(id, { text: "scraper" });
  await client.computers.click(id, { x: 580, y: 330 }); // password field
  await client.computers.type(id, { text: "password" });
  await client.computers.click(id, { x: 580, y: 390 }); // login button
  await new Promise((r) => setTimeout(r, 2000));

  // Verify login BEFORE saving — a saved session that never
  // authenticated is useless on restore.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (!content.includes("Logout")) {
    throw new Error("Login failed");
  }

  // Save session ID for later runs
  writeFileSync("session.txt", id);
  console.log(`Session saved: ${id}`);
} finally {
  await client.computers.delete(id);
}

Now use the saved session ID to skip login and go straight to scraping.

scrape_login.py
import re

from tzafon import Lightcone

client = Lightcone()

# Replace with your saved session ID (e.g. read it back from session.txt)
session_id = "your_saved_session_id"

# Restore the saved session instead of creating a fresh browser —
# the login cookies come back with it.
with client.computer.create(
    kind="browser",
    environment_id=session_id,
) as computer:
    # Go directly to an authenticated page, skipping the login form.
    computer.navigate("https://quotes.toscrape.com")
    computer.wait(2)

    # Verify the restored session is still authenticated.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" not in content:
        print("Session expired — need to log in again")
    else:
        # Extract quotes and their authors with simple regexes over the raw HTML.
        quotes = re.findall(r'class="text" itemprop="text">(.*?)<', content)
        authors = re.findall(r'class="author" itemprop="author">(.*?)<', content)
        for quote, author in zip(quotes, authors):
            # Clean up HTML entities (curly quotation marks)
            quote = quote.replace("&#8220;", '"').replace("&#8221;", '"')
            # Fix: the original printed quote and author run together;
            # separate them with an em dash to match the sample output.
            print(f"{quote} — {author}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";
import { readFileSync } from "fs";

const client = new Lightcone();

// Restore the saved session instead of creating a fresh browser —
// the login cookies come back with it.
const sessionId = readFileSync("session.txt", "utf-8").trim();
const computer = await client.computers.create({
  kind: "browser",
  environment_id: sessionId,
});
const id = computer.id!;
try {
  // Go directly to an authenticated page, skipping the login form.
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com" });
  await new Promise((r) => setTimeout(r, 2000));

  // Verify the restored session is still authenticated. Default to ""
  // so a missing payload reports expiry instead of crashing.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (!content.includes("Logout")) {
    console.log("Session expired — need to log in again");
  } else {
    // Extract quotes and their authors with simple regexes over the raw HTML.
    const quoteMatches = [...content.matchAll(/class="text" itemprop="text">(.*?)</g)];
    const authorMatches = [...content.matchAll(/class="author" itemprop="author">(.*?)</g)];
    for (let i = 0; i < quoteMatches.length; i++) {
      // Clean up HTML entities (curly quotation marks)
      const quote = quoteMatches[i][1]
        .replace(/&#8220;/g, '"')
        .replace(/&#8221;/g, '"');
      const author = authorMatches[i]?.[1] ?? "Unknown";
      console.log(`${quote} — ${author}`);
    }
  }
} finally {
  await client.computers.delete(id);
}

You should see output like:

"The world as we have created it is a process of our thinking..." — Albert Einstein
"It is our choices, Harry, that show what we truly are..." — J.K. Rowling
"There are only two ways to live your life..." — Albert Einstein

For sites with complex login flows (CAPTCHAs, multi-factor auth, dynamic forms), let an AI agent handle the login instead of hardcoding coordinates:

# For complex login flows (CAPTCHAs, MFA, dynamic forms), delegate the
# whole task to an AI agent instead of hardcoding click coordinates.
# Each streamed event reports the agent's progress.
for event in client.agent.tasks.start_stream(
    instruction=(
        "Go to https://quotes.toscrape.com/login. "
        "Log in with username 'scraper' and password 'password'. "
        "After logging in, extract all the quotes and their authors from the homepage. "
        "Report each quote with its author."
    ),
    kind="browser",
    max_steps=20,  # safety cap on how many actions the agent may take
):
    print(event)
const stream = await client.agent.tasks.startStream({
  instruction:
    "Go to https://quotes.toscrape.com/login. " +
    "Log in with username 'scraper' and password 'password'. " +
    "After logging in, extract all the quotes and their authors from the homepage. " +
    "Report each quote with its author.",
  kind: "browser",
  max_steps: 20, // safety cap on how many actions the agent may take
});
// Each streamed event reports the agent's progress.
for await (const event of stream) {
  console.log(event);
}

In this tutorial, you:

  1. Navigated to a login page and verified the layout with screenshots
  2. Filled and submitted a form using click and type actions
  3. Verified login success by checking the page HTML for expected content
  4. Saved a persistent session so you don’t need to log in every time
  5. Restored a session and scraped authenticated content
  6. Used an AI agent as an alternative for complex login flows