fix(crawl): submit sitemapped jobs in bulk

Gergő Móricz 2024-08-14 20:34:19 +02:00
parent 2ca1017fc3
commit b8ec40dd72
2 changed files with 39 additions and 15 deletions
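
The change replaces the per-URL lockURL / addScrapeJob / addCrawlJob loop with one bulk pass over the sitemap: build every job description up front, lock all URLs with a single lockURLs call, record all job ids with a single addCrawlJobs call, then hand the whole array to the queue with addBulk. A minimal sketch of the enqueue half, assuming getScrapeQueue() returns a BullMQ-style queue; enqueueSitemapJobs and its trimmed-down data payload are illustrative, not part of the commit:

import { Queue } from "bullmq";
import { v4 as uuidv4 } from "uuid";

// Illustrative stand-in for getScrapeQueue(); the real queue comes from queue-service.
const scrapeQueue = new Queue("scrape");

async function enqueueSitemapJobs(crawlId: string, urls: string[]) {
  // One { name, data, opts } entry per sitemap URL, all tagged with the crawl id.
  const jobs = urls.map((url) => {
    const uuid = uuidv4();
    return {
      name: uuid,
      data: { url, mode: "single_urls", crawl_id: crawlId, sitemapped: true },
      opts: { jobId: uuid, priority: 2 },
    };
  });
  // addBulk enqueues the whole array in one call instead of one add() per URL.
  await scrapeQueue.addBulk(jobs);
}

Generating the uuid up front and passing it as jobId means the crawl's job ids are known before the jobs are enqueued, so they can also be recorded in Redis in bulk.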

View File

@@ -10,7 +10,8 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -115,20 +116,31 @@ export async function crawlController(req: Request, res: Response) {
     const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
     if (sitemap !== null) {
-      for (const url of sitemap.map(x => x.url)) {
-        await lockURL(id, sc, url);
-        const job = await addScrapeJob({
-          url,
-          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
-          team_id: team_id,
-          pageOptions: pageOptions,
-          origin: req.body.origin ?? defaultOrigin,
-          crawl_id: id,
-          sitemapped: true,
-        });
-        await addCrawlJob(id, job.id);
-      }
+      const jobs = sitemap.map(x => {
+        const url = x.url;
+        const uuid = uuidv4();
+        return {
+          name: uuid,
+          data: {
+            url,
+            mode: "single_urls",
+            crawlerOptions: crawlerOptions,
+            team_id: team_id,
+            pageOptions: pageOptions,
+            origin: req.body.origin ?? defaultOrigin,
+            crawl_id: id,
+            sitemapped: true,
+          },
+          opts: {
+            jobId: uuid,
+            priority: 2,
+          }
+        };
+      })
+
+      await lockURLs(id, jobs.map(x => x.data.url));
+      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
       const job = await addScrapeJob({

View File

@@ -30,6 +30,11 @@ export async function addCrawlJob(id: string, job_id: string) {
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
 }
 
+export async function addCrawlJobs(id: string, job_ids: string[]) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
@@ -54,6 +59,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   return res;
 }
 
+/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
+export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
 export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
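
On the Redis side, the two new helpers are bulk versions of addCrawlJob and lockURL: one variadic SADD per key, plus a 24-hour TTL that is set only if the key has no TTL yet (the NX flag). A small usage sketch, assuming the helpers are imported from the same crawl-redis module as in the controller above; registerSitemapJobs and its parameters are illustrative:

import { addCrawlJobs, lockURLs } from "../../src/lib/crawl-redis";

// Bulk variant of the old per-URL flow: one SADD covering all visited URLs
// and one SADD covering all job ids for the crawl.
async function registerSitemapJobs(crawlId: string, jobs: { url: string; jobId: string }[]) {
  await lockURLs(crawlId, jobs.map((j) => j.url));
  await addCrawlJobs(crawlId, jobs.map((j) => j.jobId));
}

Per the NOTE above, lockURLs does no limit checking of its own, so it should only be used on paths where the limit has already been enforced, such as the sitemap results here.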