map + search + scrape markdown bug

This commit is contained in:
rafaelsideguide 2024-08-16 17:57:11 -03:00
parent 3fcb21930e
commit 7a61325500
13 changed files with 74 additions and 86 deletions

View File

@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= # Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= # Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you have a serper key you'd like to use as a search api
SERPER_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs

View File

@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
const searchOptions = req.body.searchOptions ?? { limit: 5 };
const jobId = uuidv4();
try {

View File

@ -1,66 +1,63 @@
import { Request, Response } from "express";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
import { checkTeamCredits } from "../../services/billing/credit_billing";
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse , MapRequest } from "./types";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
import { search } from "../../search";
import { checkAndUpdateURL } from "../../lib/validateUrl";
configDotenv();
/**
 * Maps a site: responds with a deduplicated list of links discovered for the
 * requested URL, combining the crawler's sitemap with a `site:` search query
 * against the configured search provider.
 *
 * @param req - authenticated request whose body matches `mapRequestSchema`
 *              (`{ url: string, ...crawler options }`)
 * @param res - responds 200 with `{ success: true, links: string[] }`
 * @throws ZodError if the request body fails `mapRequestSchema` validation
 */
export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
  req.body = mapRequestSchema.parse(req.body);

  const id = uuidv4();
  // The requested URL itself is always part of the result set.
  let links: string[] = [req.body.url];
  const crawlerOptions = legacyCrawlerOptions(req.body);

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions,
    pageOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
  };

  const crawler = crawlToCrawler(id, sc);

  try {
    sc.robots = await crawler.getRobotsTxt();
  } catch (e) {
    // Missing robots.txt is expected for many sites; log at debug level only.
    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
  }

  // Sitemap discovery can be disabled via crawler options.
  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
  if (sitemap !== null) {
    sitemap.forEach(x => { links.push(x.url); });
  }

  // Augment sitemap results with a `site:` search against the search provider.
  const searchResults = await search({
    query: `site:${req.body.url}`,
    advanced: false,
    num_results: 50,
    lang: "en",
    country: "us",
    location: "United States",
  });

  if (searchResults.length > 0) {
    searchResults.forEach(x => { links.push(x.url); });
  }

  // Normalize every URL, then dedupe while preserving first-seen order.
  links = links.map(x => checkAndUpdateURL(x).url);
  links = [...new Set(links)];

  return res.status(200).json({
    success: true,
    links,
  });
}

View File

@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
return {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
onlyIncludeTags: x.includeTags,

View File

@ -11,6 +11,7 @@ export interface Progress {
}
export type PageOptions = {
includeMarkdown?: boolean;
onlyMainContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;

View File

@ -123,6 +123,7 @@ export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
includeMarkdown: true,
onlyMainContent: true,
includeHtml: false,
includeRawHtml: false,
@ -370,7 +371,7 @@ export async function scrapSingleUrl(
if (screenshot && screenshot.length > 0) {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
@ -389,7 +390,7 @@ export async function scrapSingleUrl(
} else {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
@ -416,7 +417,7 @@ export async function scrapSingleUrl(
});
return {
content: "",
markdown: "",
markdown: pageOptions.includeMarkdown ? "" : undefined,
html: "",
linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: {

View File

@ -4,42 +4,41 @@ import { SearchResult } from "../../src/lib/entities";
dotenv.config();
export async function serper_search(q, options: {
export async function fireEngineSearch(q: string, options: {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
numResults: number;
page?: number;
}): Promise<SearchResult[]> {
let data = JSON.stringify({
q: q,
hl: options.lang,
gl: options.country,
lang: options.lang,
country: options.country,
location: options.location,
tbs: options.tbs,
num: options.num_results,
num: options.numResults,
page: options.page ?? 1,
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
return [];
}
let config = {
method: "POST",
url: "https://google.serper.dev/search",
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
headers: {
"X-API-KEY": process.env.SERPER_API_KEY,
"Content-Type": "application/json",
},
data: data,
};
const response = await axios(config);
if (response && response.data && Array.isArray(response.data.organic)) {
return response.data.organic.map((a) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
}else{
if (response && response.data) {
return response.data
} else {
return [];
}
}

View File

@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
let proxies = null;
if (proxy) {
if (proxy.startsWith("https")) {

View File

@ -1,10 +1,7 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
import { googleSearch } from "./googlesearch";
import { fireEngineSearch } from "./fireEngine";
export async function search({
query,
@ -32,10 +29,10 @@ export async function search({
timeout?: number;
}) : Promise<SearchResult[]> {
try {
if (process.env.SERPER_API_KEY ) {
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
if (process.env.FIRE_ENGINE_BETA_URL) {
return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
}
return await google_search(
return await googleSearch(
query,
advanced,
num_results,
@ -51,5 +48,4 @@ export async function search({
Logger.error(`Error in search function: ${error}`);
return []
}
// if process.env.SERPER_API_KEY is set, use serper
}

View File

@ -15,7 +15,6 @@ x-common-service: &common-service
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}

View File

@ -6,7 +6,6 @@ type: Opaque
data:
OPENAI_API_KEY: ""
SLACK_WEBHOOK_URL: ""
SERPER_API_KEY: ""
LLAMAPARSE_API_KEY: ""
LOGTAIL_KEY: ""
BULL_AUTH_KEY: ""