Merge pull request #332 from mendableai/feat/rawHtmlExtraction
Adds pageOptions.includeRawHtml and new extraction mode "llm-extraction-from-raw-html"
This commit is contained in:
commit
42cd58a679
|
@ -6801,7 +6801,7 @@ packages:
|
|||
handlebars: 4.7.8
|
||||
openai: 3.3.0
|
||||
sbd: 1.0.19
|
||||
typescript: 5.4.5
|
||||
typescript: 5.5.3
|
||||
uuid: 9.0.1
|
||||
zod: 3.23.8
|
||||
transitivePeerDependencies:
|
||||
|
@ -7767,6 +7767,12 @@ packages:
|
|||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
/typescript@5.5.3:
|
||||
resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
dev: false
|
||||
|
||||
/typesense@1.8.2(@babel/runtime@7.24.6):
|
||||
resolution: {integrity: sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==}
|
||||
engines: {node: '>=18'}
|
||||
|
|
|
@ -131,6 +131,28 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and includeRawHtml set to true", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://roastmywebsite.ai",
|
||||
pageOptions: { includeRawHtml: true },
|
||||
});
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.content).toContain("_Roast_");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.rawHtml).toContain("<h1");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
|
@ -1177,6 +1199,47 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(llmExtraction.is_open_source).toBe(false);
|
||||
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
||||
}, 60000); // 60 secs
|
||||
|
||||
it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://mendable.ai",
|
||||
|
||||
extractorOptions: {
|
||||
mode: "llm-extraction-from-raw-html",
|
||||
extractionPrompt:
|
||||
"Based on the information on the page, what are the primary and secondary CTA buttons?",
|
||||
extractionSchema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
primary_cta: {
|
||||
type: "string",
|
||||
},
|
||||
secondary_cta: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
required: ["primary_cta", "secondary_cta"],
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||
expect(response.statusCode).toBe(200);
|
||||
|
||||
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
||||
let llmExtraction = response.body.data.llm_extraction;
|
||||
|
||||
// Check if the llm_extraction object has the required properties with correct types and values
|
||||
expect(llmExtraction).toHaveProperty("primary_cta");
|
||||
expect(typeof llmExtraction.primary_cta).toBe("string");
|
||||
expect(llmExtraction).toHaveProperty("secondary_cta");
|
||||
expect(typeof llmExtraction.secondary_cta).toBe("string");
|
||||
|
||||
}, 60000); // 60 secs
|
||||
});
|
||||
|
||||
// describe("POST /v0/scrape for Top 100 Companies", () => {
|
||||
|
|
|
@ -58,19 +58,27 @@ export async function scrapeHelper(
|
|||
}
|
||||
|
||||
// make sure doc.content is not empty
|
||||
const filteredDocs = docs.filter(
|
||||
let filteredDocs = docs.filter(
|
||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||
}
|
||||
|
||||
|
||||
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
filteredDocs.forEach(doc => {
|
||||
delete doc.rawHtml;
|
||||
});
|
||||
}
|
||||
|
||||
let creditsToBeBilled = filteredDocs.length;
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
|
||||
|
||||
if (extractorOptions.mode === "llm-extraction") {
|
||||
if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
|
||||
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";
|
|||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
documents: Document[],
|
||||
extractionOptions: ExtractorOptions
|
||||
extractionOptions: ExtractorOptions,
|
||||
mode: "markdown" | "raw-html"
|
||||
): Promise<Document[]> {
|
||||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
|
@ -28,6 +29,7 @@ export async function generateCompletions(
|
|||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt,
|
||||
mode: mode,
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
const validate = ajv.compile(schema);
|
||||
|
|
|
@ -13,26 +13,37 @@ const defaultPrompt =
|
|||
"You are a professional web scraper. Extract the contents of the webpage";
|
||||
|
||||
function prepareOpenAIDoc(
|
||||
document: Document
|
||||
document: Document,
|
||||
mode: "markdown" | "raw-html"
|
||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
|
||||
|
||||
let markdown = document.markdown;
|
||||
|
||||
// Check if the markdown content exists in the document
|
||||
if (!markdown) {
|
||||
let extractionTarget = document.markdown;
|
||||
|
||||
if (mode === "raw-html") {
|
||||
extractionTarget = document.rawHtml;
|
||||
}
|
||||
|
||||
// Check if the extraction target (markdown, or rawHtml in "raw-html" mode) exists in the document
|
||||
if (!extractionTarget) {
|
||||
throw new Error(
|
||||
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
|
||||
`${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// count number of tokens
|
||||
const numTokens = numTokensFromString(document.markdown, "gpt-4");
|
||||
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
||||
|
||||
if (numTokens > maxTokens) {
|
||||
// trim the document to the maximum number of tokens, tokens != characters
|
||||
markdown = markdown.slice(0, (maxTokens * modifier));
|
||||
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
|
||||
}
|
||||
|
||||
return [[{ type: "text", text: markdown }], numTokens];
|
||||
return [[{ type: "text", text: extractionTarget }], numTokens];
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions({
|
||||
|
@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
|
|||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
temperature,
|
||||
mode
|
||||
}: {
|
||||
client: OpenAI;
|
||||
model?: string;
|
||||
|
@ -49,9 +61,10 @@ export async function generateOpenAICompletions({
|
|||
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string;
|
||||
temperature?: number;
|
||||
mode: "markdown" | "raw-html";
|
||||
}): Promise<Document> {
|
||||
const openai = client as OpenAI;
|
||||
const [content, numTokens] = prepareOpenAIDoc(document);
|
||||
const [content, numTokens] = prepareOpenAIDoc(document, mode);
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
model,
|
||||
|
|
|
@ -13,6 +13,7 @@ export interface Progress {
|
|||
export type PageOptions = {
|
||||
onlyMainContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
fallback?: boolean;
|
||||
fetchPageContent?: boolean;
|
||||
waitFor?: number;
|
||||
|
@ -25,7 +26,7 @@ export type PageOptions = {
|
|||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
mode: "markdown" | "llm-extraction";
|
||||
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||
extractionPrompt?: string;
|
||||
extractionSchema?: Record<string, any>;
|
||||
}
|
||||
|
@ -73,6 +74,7 @@ export class Document {
|
|||
content: string;
|
||||
markdown?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
llm_extraction?: Record<string, any>;
|
||||
createdAt?: Date;
|
||||
updatedAt?: Date;
|
||||
|
|
|
@ -66,6 +66,7 @@ export class WebScraperDataProvider {
|
|||
const result = await scrapSingleUrl(
|
||||
url,
|
||||
this.pageOptions,
|
||||
this.extractorOptions,
|
||||
existingHTML
|
||||
);
|
||||
processedUrls++;
|
||||
|
@ -269,10 +270,16 @@ export class WebScraperDataProvider {
|
|||
// documents = await this.applyImgAltText(documents);
|
||||
|
||||
if (
|
||||
this.extractorOptions.mode === "llm-extraction" &&
|
||||
(this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions);
|
||||
documents = await generateCompletions(documents, this.extractorOptions, "markdown");
|
||||
}
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
|
||||
}
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
|
|||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
||||
import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
|
@ -348,10 +348,14 @@ export async function scrapSingleUrl(
|
|||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
headers: undefined,
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown"
|
||||
},
|
||||
existingHtml: string = ""
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
@ -517,8 +521,10 @@ export async function scrapSingleUrl(
|
|||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
}
|
||||
if (attempt.pageError) {
|
||||
if (attempt.pageError && attempt.pageStatusCode != 200) {
|
||||
pageError = attempt.pageError;
|
||||
} else {
|
||||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) break;
|
||||
|
@ -542,6 +548,7 @@ export async function scrapSingleUrl(
|
|||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
|
@ -555,6 +562,7 @@ export async function scrapSingleUrl(
|
|||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml: pageOptions.includeRawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
sourceURL: urlToScrap,
|
||||
|
|
Loading…
Reference in New Issue