Skip to content
Dashboard

Tutorial: Scrape Behind a Login

Build a scraper that logs into a website, saves the session, and extracts data from authenticated pages.

In this tutorial, you’ll build a scraper that logs into a website, persists the session so you don’t have to log in again, and extracts data from pages that require authentication. Along the way, you’ll learn how to interact with forms, use persistent sessions, combine screenshots with HTML extraction, and build reusable scraping scripts.

Prerequisites: Complete the Quickstart and have TZAFON_API_KEY set in your environment.

Time: About 15 minutes.

What you'll build — a script that:

  1. Opens a browser and navigates to a login page
  2. Fills in credentials and submits the form
  3. Verifies the login succeeded with a screenshot
  4. Saves the session for future runs
  5. Restores the saved session and scrapes authenticated content

We’ll use Quotes to Scrape, a safe practice site with a login page.

scrape_login.py
from tzafon import Lightcone

client = Lightcone()

# Open a remote browser, load the login page, and capture a screenshot
# so we can locate the form fields before interacting with them.
# The `with` block ensures the browser is cleaned up when we're done.
with client.computer.create(kind="browser") as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)  # give the page time to render before capturing
    result = computer.screenshot()
    print(f"Login page: {computer.get_screenshot_url(result)}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();

// Open a remote browser, load the login page, and capture a screenshot
// so we can locate the form fields before interacting with them.
const computer = await client.computers.create({ kind: "browser" });
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000)); // give the page time to render
  const result = await client.computers.screenshot(id);
  console.log("Login page:", result.result?.screenshot_url);
} finally {
  // Always release the remote browser, even if a step above threw.
  await client.computers.delete(id);
}

Open the screenshot URL to see the login form. Note the positions of the username field, password field, and submit button.

Now click each field, type the credentials, and submit. Quotes to Scrape accepts any username with the password “password”.

scrape_login.py
from tzafon import Lightcone

client = Lightcone()

# Log in by clicking each form field and typing into it, then verify
# the result both visually (screenshot) and programmatically (HTML check).
with client.computer.create(kind="browser") as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)  # let the page render before clicking

    # Fill in the username
    # NOTE: coordinates come from inspecting the step-1 screenshot;
    # they are specific to this page layout.
    computer.click(580, 280)  # Username field
    computer.type("scraper")

    # Fill in the password (the site accepts any username + "password")
    computer.click(580, 330)  # Password field
    computer.type("password")

    # Submit
    computer.click(580, 390)  # Login button
    computer.wait(2)  # wait for the post-login redirect

    # Verify login succeeded with a screenshot for human inspection...
    result = computer.screenshot()
    print(f"After login: {computer.get_screenshot_url(result)}")

    # ...and by checking the page HTML for the "Logout" link that only
    # appears for authenticated users.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" in content:
        print("Login successful!")
    else:
        print("Login may have failed — check the screenshot")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";

const client = new Lightcone();

// Log in by clicking each form field and typing into it, then verify
// the result both visually (screenshot) and programmatically (HTML check).
const computer = await client.computers.create({ kind: "browser" });
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000)); // let the page render

  // Fill in the username
  // NOTE: coordinates come from inspecting the step-1 screenshot.
  await client.computers.click(id, { x: 580, y: 280 }); // username field
  await client.computers.type(id, { text: "scraper" });

  // Fill in the password (the site accepts any username + "password")
  await client.computers.click(id, { x: 580, y: 330 }); // password field
  await client.computers.type(id, { text: "password" });

  // Submit
  await client.computers.click(id, { x: 580, y: 390 }); // login button
  await new Promise((r) => setTimeout(r, 2000)); // wait for the redirect

  // Verify with a screenshot for human inspection...
  const result = await client.computers.screenshot(id);
  console.log("After login:", result.result?.screenshot_url);

  // ...and by checking the HTML for the "Logout" link that only appears
  // for authenticated users. Default to "" so a missing payload reports
  // a failed login instead of crashing on `.includes`.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (content.includes("Logout")) {
    console.log("Login successful!");
  } else {
    console.log("Login may have failed — check the screenshot");
  }
} finally {
  await client.computers.delete(id);
}

Creating a new browser and logging in every time is slow. Use a persistent session to save cookies and skip the login on future runs.

scrape_login.py
from pathlib import Path

from tzafon import Lightcone

client = Lightcone()

# Log in once with a persistent browser so cookies survive this run and
# future scripts can skip the login entirely.
with client.computer.create(kind="browser", persistent=True) as computer:
    computer.navigate("https://quotes.toscrape.com/login")
    computer.wait(2)

    # Same form interaction as before (coordinates from the screenshot).
    computer.click(580, 280)  # username field
    computer.type("scraper")
    computer.click(580, 330)  # password field
    computer.type("password")
    computer.click(580, 390)  # login button
    computer.wait(2)

    # Verify login BEFORE saving — a saved session that never
    # authenticated is useless on restore.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" not in content:
        raise Exception("Login failed")

    session_id = computer.id
    # Save the session ID so the scraping script can restore it later
    # (swap this for a database write in a real pipeline).
    Path("session.txt").write_text(session_id)
    print(f"Session saved: {session_id}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";
import { writeFileSync } from "fs";

const client = new Lightcone();

// Log in once with a persistent browser so cookies survive this run and
// future scripts can skip the login entirely.
const computer = await client.computers.create({
  kind: "browser",
  persistent: true,
});
const id = computer.id!;
try {
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com/login" });
  await new Promise((r) => setTimeout(r, 2000));

  // Same form interaction as before (coordinates from the screenshot).
  await client.computers.click(id, { x: 580, y: 280 }); // username field
  await client.computers.type(id, { text: "scraper" });
  await client.computers.click(id, { x: 580, y: 330 }); // password field
  await client.computers.type(id, { text: "password" });
  await client.computers.click(id, { x: 580, y: 390 }); // login button
  await new Promise((r) => setTimeout(r, 2000));

  // Verify login BEFORE saving — a saved session that never
  // authenticated is useless on restore.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (!content.includes("Logout")) {
    throw new Error("Login failed");
  }

  // Save session ID for later runs
  writeFileSync("session.txt", id);
  console.log(`Session saved: ${id}`);
} finally {
  await client.computers.delete(id);
}

Now use the saved session ID to skip login and go straight to scraping.

scrape_login.py
import re

from tzafon import Lightcone

client = Lightcone()

# Replace with your saved session ID (e.g. read it back from session.txt)
session_id = "your_saved_session_id"

# Restore the saved session instead of creating a fresh browser —
# the login cookies come back with it.
with client.computer.create(
    kind="browser",
    environment_id=session_id,
) as computer:
    # Go directly to an authenticated page, skipping the login form.
    computer.navigate("https://quotes.toscrape.com")
    computer.wait(2)

    # Verify the restored session is still authenticated.
    html_result = computer.html()
    content = computer.get_html_content(html_result)
    if "Logout" not in content:
        print("Session expired — need to log in again")
    else:
        # Extract quotes and their authors with simple regexes over the raw HTML.
        quotes = re.findall(r'class="text" itemprop="text">(.*?)<', content)
        authors = re.findall(r'class="author" itemprop="author">(.*?)<', content)
        for quote, author in zip(quotes, authors):
            # Clean up HTML entities (curly quotation marks)
            quote = quote.replace("&#8220;", '"').replace("&#8221;", '"')
            # Fix: the original printed quote and author run together;
            # separate them with an em dash to match the sample output.
            print(f"{quote} — {author}")
scrape_login.ts
import Lightcone from "@tzafon/lightcone";
import { readFileSync } from "fs";

const client = new Lightcone();

// Restore the saved session instead of creating a fresh browser —
// the login cookies come back with it.
const sessionId = readFileSync("session.txt", "utf-8").trim();
const computer = await client.computers.create({
  kind: "browser",
  environment_id: sessionId,
});
const id = computer.id!;
try {
  // Go directly to an authenticated page, skipping the login form.
  await client.computers.navigate(id, { url: "https://quotes.toscrape.com" });
  await new Promise((r) => setTimeout(r, 2000));

  // Verify the restored session is still authenticated. Default to ""
  // so a missing payload reports expiry instead of crashing.
  const htmlResult = await client.computers.html(id);
  const content = htmlResult.result?.html_content ?? "";
  if (!content.includes("Logout")) {
    console.log("Session expired — need to log in again");
  } else {
    // Extract quotes and their authors with simple regexes over the raw HTML.
    const quoteMatches = [...content.matchAll(/class="text" itemprop="text">(.*?)</g)];
    const authorMatches = [...content.matchAll(/class="author" itemprop="author">(.*?)</g)];
    for (let i = 0; i < quoteMatches.length; i++) {
      // Clean up HTML entities (curly quotation marks)
      const quote = quoteMatches[i][1]
        .replace(/&#8220;/g, '"')
        .replace(/&#8221;/g, '"');
      const author = authorMatches[i]?.[1] ?? "Unknown";
      console.log(`${quote} — ${author}`);
    }
  }
} finally {
  await client.computers.delete(id);
}

You should see output like:

"The world as we have created it is a process of our thinking..." — Albert Einstein
"It is our choices, Harry, that show what we truly are..." — J.K. Rowling
"There are only two ways to live your life..." — Albert Einstein

For sites with complex login flows (CAPTCHAs, multi-factor auth, dynamic forms), let an AI agent handle the login instead of hardcoding coordinates:

# For complex login flows (CAPTCHAs, MFA, dynamic forms), delegate the
# whole task to an AI agent instead of hardcoding click coordinates.
# Each streamed event reports the agent's progress.
for event in client.agent.tasks.start_stream(
    instruction=(
        "Go to https://quotes.toscrape.com/login. "
        "Log in with username 'scraper' and password 'password'. "
        "After logging in, extract all the quotes and their authors from the homepage. "
        "Report each quote with its author."
    ),
    kind="browser",
    max_steps=20,  # safety cap on how many actions the agent may take
):
    print(event)
const stream = await client.agent.tasks.startStream({
  instruction:
    "Go to https://quotes.toscrape.com/login. " +
    "Log in with username 'scraper' and password 'password'. " +
    "After logging in, extract all the quotes and their authors from the homepage. " +
    "Report each quote with its author.",
  kind: "browser",
  max_steps: 20, // safety cap on how many actions the agent may take
});
// Each streamed event reports the agent's progress.
for await (const event of stream) {
  console.log(event);
}

In this tutorial, you:

  1. Navigated to a login page and verified the layout with screenshots
  2. Filled and submitted a form using click and type actions
  3. Verified login success by checking the page HTML for expected content
  4. Saved a persistent session so you don’t need to log in every time
  5. Restored a session and scraped authenticated content
  6. Used an AI agent as an alternative for complex login flows