Merge pull request #14 from mendableai/nsc/clean-content

Option to extract only the main content, excluding headers, navs, footers etc.
This commit is contained in:
Nicolas 2024-04-17 21:40:47 -04:00 committed by GitHub
commit 7ce2dd976f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 119 additions and 20 deletions

View File

@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
return res.status(400).json({ error: "Url is required" });
}
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
try {
const a = new WebScraperDataProvider();
await a.setOptions({
@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
crawlerOptions: {
...crawlerOptions,
},
pageOptions: pageOptions,
});
const docs = await a.getDocuments(false);
@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
if (mode === "single_urls" && !url.includes(",")) {
try {
@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
crawlerOptions: {
returnOnlyUrls: true,
},
pageOptions: pageOptions,
});
const docs = await a.getDocuments(false, (progress) => {
@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions },
team_id: team_id,
pageOptions: pageOptions,
});
res.json({ jobId: job.id });
@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview",
pageOptions: pageOptions,
});
res.json({ jobId: job.id });

View File

@ -9,6 +9,24 @@ export interface Progress {
currentDocumentUrl?: string;
}
/**
 * Per-page scraping options forwarded from the API request body down to the
 * single-URL scraper.
 */
export type PageOptions = {
// When true, the scraper strips elements matching the non-main-content
// selector list (headers, navs, footers, etc.) before converting to markdown.
// Callers that omit pageOptions fall back to { onlyMainContent: false }.
onlyMainContent?: boolean;
};
/**
 * Options passed to WebScraperDataProvider.setOptions.
 */
export type WebScraperOptions = {
// URLs to process; job runners pass a single URL in "crawl" mode and a
// comma-split list otherwise.
urls: string[];
// Selects how the URLs are processed.
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: {
// NOTE(review): presumably skips content scraping and returns only discovered URLs — confirm against the provider.
returnOnlyUrls?: boolean;
// NOTE(review): presumably URL patterns to keep / drop while crawling — confirm pattern syntax.
includes?: string[];
// Empty-string entries are filtered out by the provider in setOptions.
excludes?: string[];
maxCrawledLinks?: number;
// The provider falls back to 10000 when this is omitted.
limit?: number;
// The provider falls back to false when this is omitted.
generateImgAltText?: boolean;
};
// Page-level options; the provider falls back to { onlyMainContent: false }.
pageOptions?: PageOptions;
// NOTE(review): looks like the per-batch size used when scraping URLs in parallel — confirm where the provider reads it.
concurrentRequests?: number;
};
export class Document {
id?: string;
content: string;

View File

@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
url: job.data.url,
mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
inProgress: (progress) => {
job.progress(progress);
},
@ -29,6 +30,7 @@ export async function runWebScraper({
url,
mode,
crawlerOptions,
pageOptions,
inProgress,
onSuccess,
onError,
@ -37,6 +39,7 @@ export async function runWebScraper({
url: string;
mode: "crawl" | "single_urls" | "sitemap";
crawlerOptions: any;
pageOptions?: any;
inProgress: (progress: any) => void;
onSuccess: (result: any) => void;
onError: (error: any) => void;
@ -44,18 +47,19 @@ export async function runWebScraper({
}): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
try {
const provider = new WebScraperDataProvider();
if (mode === "crawl") {
await provider.setOptions({
mode: mode,
urls: [url],
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
});
} else {
await provider.setOptions({
mode: mode,
urls: url.split(","),
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
});
}
const docs = (await provider.getDocuments(false, (progress: Progress) => {

View File

@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
metadata: { sourceURL: "https://example.com/another-page" },
content: "![another alt text](./another-image.png)",
},
{
metadata: { sourceURL: "https://example.com/another-page" },
content: "![another alt text](./another-image.webp)",
},
{
metadata: { sourceURL: "https://example.com/data-image" },
content: "![data image](data:image/png;base64,...)",
@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
metadata: { sourceURL: "https://example.com/another-page" },
content: "![another alt text](https://example.com/another-image.png)",
},
{
metadata: { sourceURL: "https://example.com/another-page" },
content: "![another alt text](https://example.com/another-image.webp)",
},
{
metadata: { sourceURL: "https://example.com/data-image" },
content: "![data image](data:image/png;base64,...)",

View File

@ -1,4 +1,4 @@
import { Document } from "../../lib/entities";
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/gptVision";
// NOTE(review): the hunk header (-6,19 +6,7) together with the new import of
// WebScraperOptions from "../../lib/entities" indicates this local declaration
// is the removed side of the diff; the entities version additionally carries
// an optional pageOptions field. Confirm it no longer exists in the file.
export type WebScraperOptions = {
urls: string[];
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: {
returnOnlyUrls?: boolean;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
limit?: number;
generateImgAltText?: boolean;
};
concurrentRequests?: number;
};
export class WebScraperDataProvider {
private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@ -29,6 +17,7 @@ export class WebScraperDataProvider {
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private pageOptions?: PageOptions;
authorize(): void {
throw new Error("Method not implemented.");
@ -51,7 +40,7 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all(
batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, true);
const result = await scrapSingleUrl(url, true, this.pageOptions);
processedUrls++;
if (inProgress) {
inProgress({
@ -321,6 +310,7 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
//! @nicolas, for some reason an empty string was being injected here and breaking everything. Don't have time to find the source of the issue, so adding this check.
this.excludes = this.excludes.filter((item) => item !== "");

View File

@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document } from "../../lib/entities";
import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { parseTablesToMarkdown } from "./utils/parseTable";
import { excludeNonMainTags } from "./utils/excludeTags";
// import puppeteer from "puppeteer";
dotenv.config();
@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl(
urlToScrap: string,
toMarkdown: boolean = true
toMarkdown: boolean = true,
pageOptions: PageOptions = { onlyMainContent: true }
): Promise<Document> {
console.log(`Scraping URL: ${urlToScrap}`);
urlToScrap = urlToScrap.trim();
const removeUnwantedElements = (html: string) => {
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => {
soup(tag).remove();
});
}
return soup.html();
};
@ -133,7 +141,7 @@ export async function scrapSingleUrl(
}
break;
}
let cleanedHtml = removeUnwantedElements(text);
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];
};

View File

@ -0,0 +1,60 @@
/**
 * CSS selectors (tag names, classes and ids) for page regions that are not
 * part of the main content.
 *
 * The single-URL scraper iterates this list when the `onlyMainContent` page
 * option is set and removes every element each selector matches.
 */
export const excludeNonMainTags: string[] = [
  // Structural tags
  "header", "footer", "nav", "aside",

  // Page header / top bar
  ".header", ".top", ".navbar", "#header",

  // Page footer
  ".footer", ".bottom", "#footer",

  // Sidebars
  ".sidebar", ".side", ".aside", "#sidebar",

  // Modals and overlays
  ".modal", ".popup", "#modal", ".overlay",

  // Advertising
  ".ad", ".ads", ".advert", "#ad",

  // Language pickers
  ".lang-selector", ".language", "#language-selector",

  // Social links
  ".social", ".social-media", ".social-links", "#social",

  // Menus and navigation aids
  ".menu", ".navigation", "#nav",
  ".breadcrumbs", "#breadcrumbs",

  // Forms and search
  ".form", "form", "#search-form", ".search", "#search",

  // Sharing and paging controls
  ".share", "#share",
  ".pagination", "#pagination",

  // Widgets and related-content blocks
  ".widget", "#widget",
  ".related", "#related",

  // Taxonomy
  ".tag", "#tag",
  ".category", "#category",

  // Comments and bylines
  ".comment", "#comment",
  ".reply", "#reply",
  ".author", "#author",
];

View File

@ -20,7 +20,9 @@ export interface WebScraperOptions {
url: string;
mode: "crawl" | "single_urls" | "sitemap";
crawlerOptions: any;
pageOptions: any;
team_id: string;
}