// firecrawl/apps/test-suite/tests/scrape.test.ts

import request from "supertest";
import dotenv from "dotenv";
2024-05-15 18:50:50 +00:00
import { numTokensFromString } from "../utils/tokens";
2024-05-08 18:38:46 +00:00
import OpenAI from "openai";
2024-05-15 18:50:50 +00:00
import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "../utils/log";
2024-05-08 19:36:54 +00:00
2024-05-15 18:50:50 +00:00
import websitesData from "../data/scrape.json";
2024-05-08 18:38:46 +00:00
import "dotenv/config";
2024-05-15 18:50:50 +00:00
import fs from 'fs';
dotenv.config();
// Shape of one entry in ../data/scrape.json: a site to scrape, the yes/no
// question to ask the model about the scraped markdown, and the substring
// the model's answer must contain for the test case to pass.
interface WebsiteData {
website: string;
prompt: string;
expected_output: string;
}
const TEST_URL = "http://127.0.0.1:3002";
describe("Scraping Checkup (E2E)", () => {
beforeAll(() => {
if (!process.env.TEST_API_KEY) {
throw new Error("TEST_API_KEY is not set");
}
if (!process.env.OPENAI_API_KEY) {
throw new Error("OPENAI_API_KEY is not set");
}
});
2024-05-08 19:18:53 +00:00
describe("Scraping website tests with a dataset", () => {
it("Should scrape the website and prompt it against OpenAI", async () => {
2024-09-03 13:56:07 +00:00
let totalTimeTaken = 0;
2024-05-08 18:38:46 +00:00
let passedTests = 0;
2024-05-08 19:18:53 +00:00
const batchSize = 15; // Adjusted to comply with the rate limit of 15 per minute
2024-05-08 18:38:46 +00:00
const batchPromises = [];
let totalTokens = 0;
2024-05-08 18:38:46 +00:00
const startTime = new Date().getTime();
const date = new Date();
const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;
let errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
const errorLog: WebsiteScrapeError[] = [];
2024-05-08 18:38:46 +00:00
for (let i = 0; i < websitesData.length; i += batchSize) {
2024-05-08 19:18:53 +00:00
// Introducing delay to respect the rate limit of 15 requests per minute
await new Promise(resolve => setTimeout(resolve, 10000));
2024-05-08 18:38:46 +00:00
const batch = websitesData.slice(i, i + batchSize);
const batchPromise = Promise.all(
batch.map(async (websiteData: WebsiteData) => {
try {
2024-09-03 13:56:07 +00:00
const startTime = new Date().getTime();
2024-05-08 19:36:54 +00:00
const scrapedContent = await request(TEST_URL || "")
2024-09-03 13:56:07 +00:00
.post("/v1/scrape")
2024-05-08 18:38:46 +00:00
.set("Content-Type", "application/json")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
2024-09-03 13:56:07 +00:00
.send({ url: websiteData.website });
const endTime = new Date().getTime();
const timeTaken = endTime - startTime;
totalTimeTaken += timeTaken;
2024-05-08 18:38:46 +00:00
if (scrapedContent.statusCode !== 200) {
2024-05-08 20:23:53 +00:00
console.error(`Failed to scrape ${websiteData.website} ${scrapedContent.statusCode}`);
2024-05-08 20:00:20 +00:00
errorLog.push({
website: websiteData.website,
prompt: websiteData.prompt,
expected_output: websiteData.expected_output,
actual_output: "",
2024-05-08 20:23:53 +00:00
error: `Failed to scrape website. ${scrapedContent.statusCode} ${scrapedContent.body.error}`
2024-05-08 20:00:20 +00:00
});
2024-05-08 18:38:46 +00:00
return null;
}
2024-05-08 18:38:46 +00:00
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const prompt = `Based on this markdown extracted from a website html page, ${websiteData.prompt} Just say 'yes' or 'no' to the question.\nWebsite markdown: ${scrapedContent.body.data.markdown}\n`;
2024-05-08 20:13:38 +00:00
let msg = null;
const maxRetries = 3;
let attempts = 0;
while (!msg && attempts < maxRetries) {
try {
msg = await openai.chat.completions.create({
model: "gpt-4-turbo",
max_tokens: 100,
temperature: 0,
messages: [
{
role: "user",
content: prompt
},
],
});
} catch (error) {
console.error(`Attempt ${attempts + 1}: Failed to prompt for ${websiteData.website}, error: ${error}`);
attempts++;
if (attempts < maxRetries) {
console.log(`Retrying... Attempt ${attempts + 1}`);
await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds before retrying
}
}
}
2024-05-08 18:38:46 +00:00
if (!msg) {
2024-05-08 20:13:38 +00:00
console.error(`Failed to prompt for ${websiteData.website} after ${maxRetries} attempts`);
2024-05-08 18:38:46 +00:00
errorLog.push({
website: websiteData.website,
prompt: websiteData.prompt,
expected_output: websiteData.expected_output,
actual_output: "",
error: "Failed to prompt... model error."
});
return null;
}
2024-05-08 18:38:46 +00:00
const actualOutput = (msg.choices[0].message.content ?? "").toLowerCase()
const expectedOutput = websiteData.expected_output.toLowerCase();
const numTokens = numTokensFromString(prompt,"gpt-4") + numTokensFromString(actualOutput,"gpt-4");
totalTokens += numTokens;
if (actualOutput.includes(expectedOutput)) {
passedTests++;
} else {
console.error(
`This website failed the test: ${websiteData.website}`
);
console.error(`Actual output: ${actualOutput}`);
errorLog.push({
website: websiteData.website,
prompt: websiteData.prompt,
expected_output: websiteData.expected_output,
actual_output: actualOutput,
error: "Output mismatch"
});
}
2024-05-08 18:38:46 +00:00
return {
website: websiteData.website,
prompt: websiteData.prompt,
expectedOutput,
actualOutput,
};
} catch (error) {
console.error(
`Error processing ${websiteData.website}: ${error}`
);
2024-05-08 20:00:20 +00:00
errorLog.push({
website: websiteData.website,
prompt: websiteData.prompt,
expected_output: websiteData.expected_output,
actual_output: "",
2024-05-08 20:23:53 +00:00
error: `Error processing ${websiteData.website}: ${error}`
2024-05-08 20:00:20 +00:00
});
2024-05-08 18:38:46 +00:00
return null;
}
})
);
batchPromises.push(batchPromise);
}
2024-05-08 19:26:04 +00:00
(await Promise.all(batchPromises)).flat();
const score = (passedTests / websitesData.length) * 100;
2024-05-08 18:38:46 +00:00
const endTime = new Date().getTime();
const timeTaken = (endTime - startTime) / 1000;
console.log(`Score: ${score}%`);
console.log(`Total tokens: ${totalTokens}`);
2024-09-03 13:56:07 +00:00
console.log(`Total time taken: ${totalTimeTaken} miliseconds`);
2024-05-08 18:38:46 +00:00
2024-05-08 19:26:04 +00:00
await logErrors(errorLog, timeTaken, totalTokens, score, websitesData.length);
2024-05-08 19:18:53 +00:00
if (process.env.ENV === "local" && errorLog.length > 0) {
2024-05-08 18:38:46 +00:00
if (!fs.existsSync(logsDir)){
fs.mkdirSync(logsDir, { recursive: true });
}
fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
}
2024-05-08 19:18:53 +00:00
2024-05-15 19:11:16 +00:00
expect(score).toBeGreaterThanOrEqual(70);
2024-05-08 19:18:53 +00:00
}, 350000); // 150 seconds timeout
2024-05-08 18:38:46 +00:00
});
});