map + search + scrape markdown bug

rafaelsideguide 2024-08-16 17:57:11 -03:00
parent 3fcb21930e
commit 7a61325500
13 changed files with 74 additions and 86 deletions

View File

@@ -44,7 +44,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@@ -65,7 +65,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@@ -32,8 +32,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY=
 # set if you have a llamaparse key you'd like to use to parse pdfs
 LLAMAPARSE_API_KEY=
-# set if you have a serper key you'd like to use as a search api
-SERPER_API_KEY=
 # set if you'd like to send slack server health status messages
 SLACK_WEBHOOK_URL=
 # set if you'd like to send posthog events like job logs

View File

@@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
   const searchOptions = req.body.searchOptions ?? { limit: 5 };
   const jobId = uuidv4();
   try {

View File

@@ -1,66 +1,63 @@
-import { Request, Response } from "express";
-import { Logger } from "../../../src/lib/logger";
-import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
-import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
-import { checkTeamCredits } from "../../services/billing/credit_billing";
+import { Response } from "express";
+import { v4 as uuidv4 } from "uuid";
+import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
+import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
+import { MapResponse, MapRequest } from "./types";
+import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+import { search } from "../../search";
+import { checkAndUpdateURL } from "../../lib/validateUrl";
+
+configDotenv();
 
 export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
   req.body = mapRequestSchema.parse(req.body);
-  console.log(req.body);
-  // expected req.body
-  // req.body = {
-  //   url: string
-  //   crawlerOptions:
-  // }
-  return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
-  // const mode = req.body.mode ?? "crawl";
-  // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-  // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-  // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
-  //   try {
-  //     const a = new WebScraperDataProvider();
-  //     await a.setOptions({
-  //       jobId: uuidv4(),
-  //       mode: "single_urls",
-  //       urls: [url],
-  //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
-  //       pageOptions: pageOptions,
-  //     });
-  //     const docs = await a.getDocuments(false, (progress) => {
-  //       job.progress({
-  //         current: progress.current,
-  //         total: progress.total,
-  //         current_step: "SCRAPING",
-  //         current_url: progress.currentDocumentUrl,
-  //       });
-  //     });
-  //     return res.json({
-  //       success: true,
-  //       documents: docs,
-  //     });
-  //   } catch (error) {
-  //     Logger.error(error);
-  //     return res.status(500).json({ error: error.message });
-  //   }
-  // }
-  // const job = await addWebScraperJob({
-  //   url: url,
-  //   mode: mode ?? "crawl", // fix for single urls not working
-  //   crawlerOptions: crawlerOptions,
-  //   team_id: team_id,
-  //   pageOptions: pageOptions,
-  //   origin: req.body.origin ?? defaultOrigin,
-  // });
-  // await logCrawl(job.id.toString(), team_id);
-  // res.json({ jobId: job.id });
+  const id = uuidv4();
+  let links: string[] = [req.body.url];
+
+  const crawlerOptions = legacyCrawlerOptions(req.body);
+
+  const sc: StoredCrawl = {
+    originUrl: req.body.url,
+    crawlerOptions,
+    pageOptions: {},
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+  };
+
+  const crawler = crawlToCrawler(id, sc);
+
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+  } catch (e) {
+    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+  }
+
+  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
+
+  if (sitemap !== null) {
+    sitemap.map(x => { links.push(x.url); });
+  }
+
+  const searchResults = await search({
+    query: `site:${req.body.url}`,
+    advanced: false,
+    num_results: 50,
+    lang: "en",
+    country: "us",
+    location: "United States",
+  });
+
+  if (searchResults.length > 0) {
+    searchResults.map(x => { links.push(x.url); });
+  }
+
+  links = links.map(x => checkAndUpdateURL(x).url);
+  links = [...new Set(links)];
+
+  return res.status(200).json({
+    success: true,
+    links
+  });
 }
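
The rewritten mapController collects candidate links from three sources: the requested URL itself, the site's sitemap (unless ignoreSitemap is set), and a `site:` query against the search backend, then normalizes and dedupes the list before responding. A minimal standalone sketch of that aggregation step, with hypothetical callbacks standing in for the real crawler and search modules:

// Sketch only: mirrors the link-aggregation logic above; the fetcher
// signatures are illustrative stand-ins, not the real modules.
type SitemapEntry = { url: string };
type SearchHit = { url: string };

async function gatherLinks(
  seedUrl: string,
  getSitemap: () => Promise<SitemapEntry[] | null>,    // e.g. crawler.tryGetSitemap()
  webSearch: (query: string) => Promise<SearchHit[]>,  // e.g. search({ query: `site:${seedUrl}`, ... })
  normalize: (url: string) => string                   // e.g. (u) => checkAndUpdateURL(u).url
): Promise<string[]> {
  let links: string[] = [seedUrl];

  const sitemap = await getSitemap();
  if (sitemap !== null) {
    sitemap.forEach((x) => links.push(x.url));
  }

  const results = await webSearch(`site:${seedUrl}`);
  results.forEach((x) => links.push(x.url));

  // Normalize every URL, then drop duplicates while preserving first-seen order.
  links = links.map(normalize);
  return [...new Set(links)];
}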

View File

@@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
 export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
   return {
+    includeMarkdown: x.formats.includes("markdown"),
     includeHtml: x.formats.includes("html"),
     includeRawHtml: x.formats.includes("rawHtml"),
     onlyIncludeTags: x.includeTags,
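
With includeMarkdown now derived from the requested formats, a v1 scrape that asks only for html no longer gets markdown forced on. A minimal sketch of the mapping under trimmed-down types (the real ScrapeOptions/PageOptions in ./types carry more fields):

// Illustrative subset of the real types.
type Format = "markdown" | "html" | "rawHtml";
interface ScrapeOptionsLite { formats: Format[]; includeTags?: string[]; }
interface PageOptionsLite {
  includeMarkdown?: boolean;
  includeHtml?: boolean;
  includeRawHtml?: boolean;
  onlyIncludeTags?: string[];
}

function toPageOptions(x: ScrapeOptionsLite): PageOptionsLite {
  return {
    includeMarkdown: x.formats.includes("markdown"),
    includeHtml: x.formats.includes("html"),
    includeRawHtml: x.formats.includes("rawHtml"),
    onlyIncludeTags: x.includeTags,
  };
}

// toPageOptions({ formats: ["html"] }).includeMarkdown === false
// toPageOptions({ formats: ["markdown", "html"] }).includeMarkdown === true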

View File

@@ -11,6 +11,7 @@ export interface Progress {
 }
 
 export type PageOptions = {
+  includeMarkdown?: boolean;
   onlyMainContent?: boolean;
   includeHtml?: boolean;
   includeRawHtml?: boolean;

View File

@@ -123,6 +123,7 @@ export async function scrapSingleUrl(
   jobId: string,
   urlToScrap: string,
   pageOptions: PageOptions = {
+    includeMarkdown: true,
     onlyMainContent: true,
     includeHtml: false,
     includeRawHtml: false,
@@ -370,7 +371,7 @@ export async function scrapSingleUrl(
     if (screenshot && screenshot.length > 0) {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
@@ -389,7 +390,7 @@ export async function scrapSingleUrl(
     } else {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
@@ -416,7 +417,7 @@ export async function scrapSingleUrl(
     });
     return {
       content: "",
-      markdown: "",
+      markdown: pageOptions.includeMarkdown ? "" : undefined,
      html: "",
       linksOnPage: pageOptions.includeLinks ? [] : undefined,
       metadata: {
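
Each of the hunks above gates the markdown field on pageOptions.includeMarkdown instead of always populating it, which is the markdown bug this commit fixes. A minimal sketch of the resulting document shape, using a simplified stand-in for the real Document type:

// Simplified stand-in for the real Document type.
interface DocumentLite {
  content: string;
  markdown?: string;
  html?: string;
}

function buildDocument(
  text: string,
  html: string,
  opts: { includeMarkdown?: boolean; includeHtml?: boolean }
): DocumentLite {
  return {
    content: text,
    // markdown is only emitted when the caller asked for the markdown format.
    markdown: opts.includeMarkdown ? text : undefined,
    html: opts.includeHtml ? html : undefined,
  };
}

// buildDocument("hi", "<p>hi</p>", { includeHtml: true }).markdown === undefined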

View File

@@ -4,42 +4,41 @@ import { SearchResult } from "../../src/lib/entities";
 dotenv.config();
 
-export async function serper_search(q, options: {
+export async function fireEngineSearch(q: string, options: {
   tbs?: string;
   filter?: string;
   lang?: string;
   country?: string;
   location?: string;
-  num_results: number;
+  numResults: number;
   page?: number;
 }): Promise<SearchResult[]> {
   let data = JSON.stringify({
     q: q,
-    hl: options.lang,
-    gl: options.country,
+    lang: options.lang,
+    country: options.country,
     location: options.location,
     tbs: options.tbs,
-    num: options.num_results,
+    num: options.numResults,
     page: options.page ?? 1,
   });
 
+  if (!process.env.FIRE_ENGINE_BETA_URL) {
+    return [];
+  }
+
   let config = {
     method: "POST",
-    url: "https://google.serper.dev/search",
+    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
     headers: {
-      "X-API-KEY": process.env.SERPER_API_KEY,
       "Content-Type": "application/json",
     },
     data: data,
   };
   const response = await axios(config);
-  if (response && response.data && Array.isArray(response.data.organic)) {
-    return response.data.organic.map((a) => ({
-      url: a.link,
-      title: a.title,
-      description: a.snippet,
-    }));
-  } else {
+  if (response && response.data) {
+    return response.data
+  } else {
     return [];
   }
 }
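
fireEngineSearch now posts to ${FIRE_ENGINE_BETA_URL}/search and short-circuits to an empty list when that variable is unset, so nothing ever reaches the removed Serper endpoint. A hedged usage sketch; the function is injected here rather than imported so the snippet stands alone:

// Usage sketch: `fes` stands in for the fireEngineSearch function above.
type FireEngineSearch = (
  q: string,
  options: { numResults: number; lang?: string; country?: string; location?: string }
) => Promise<{ url: string; title?: string; description?: string }[]>;

async function demo(fes: FireEngineSearch): Promise<string[]> {
  // With FIRE_ENGINE_BETA_URL unset this resolves to [] instead of throwing.
  const hits = await fes("site:example.com", { numResults: 10, lang: "en", country: "us" });
  return hits.map((h) => h.url);
}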

View File

@@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
-export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
+export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
   let proxies = null;
   if (proxy) {
     if (proxy.startsWith("https")) {

View File

@@ -1,10 +1,7 @@
 import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
-import { google_search } from "./googlesearch";
-import { serper_search } from "./serper";
+import { googleSearch } from "./googlesearch";
+import { fireEngineSearch } from "./fireEngine";
 
 export async function search({
   query,
@@ -32,10 +29,10 @@ export async function search({
   timeout?: number;
 }) : Promise<SearchResult[]> {
   try {
-    if (process.env.SERPER_API_KEY) {
-      return await serper_search(query, {num_results, tbs, filter, lang, country, location});
+    if (process.env.FIRE_ENGINE_BETA_URL) {
+      return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
     }
-    return await google_search(
+    return await googleSearch(
       query,
       advanced,
       num_results,
@@ -51,5 +48,4 @@ export async function search({
     Logger.error(`Error in search function: ${error}`);
     return []
   }
-  // if process.env.SERPER_API_KEY is set, use serper
 }
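
search() now selects its backend on FIRE_ENGINE_BETA_URL rather than SERPER_API_KEY, falling back to the Google-scraping path otherwise. A minimal sketch of that selection, with the two backends passed in instead of the real imports:

// Sketch of the env-based backend selection in search().
type SearchFn = (query: string, numResults: number) => Promise<{ url: string }[]>;

function pickBackend(fireEngine: SearchFn, google: SearchFn): SearchFn {
  // Fire-engine is only used when its base URL is configured.
  return process.env.FIRE_ENGINE_BETA_URL ? fireEngine : google;
}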

View File

@@ -15,7 +15,6 @@ x-common-service: &common-service
       - OPENAI_BASE_URL=${OPENAI_BASE_URL}
       - MODEL_NAME=${MODEL_NAME:-gpt-4o}
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
-      - SERPER_API_KEY=${SERPER_API_KEY}
       - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
       - LOGTAIL_KEY=${LOGTAIL_KEY}
       - BULL_AUTH_KEY=${BULL_AUTH_KEY}

View File

@@ -6,7 +6,6 @@ type: Opaque
 data:
   OPENAI_API_KEY: ""
   SLACK_WEBHOOK_URL: ""
-  SERPER_API_KEY: ""
   LLAMAPARSE_API_KEY: ""
   LOGTAIL_KEY: ""
   BULL_AUTH_KEY: ""