fix(crawl): submit sitemapped jobs in bulk
This commit is contained in:
parent
2ca1017fc3
commit
b8ec40dd72
|
@ -10,7 +10,8 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../src/lib/logger";
|
import { Logger } from "../../src/lib/logger";
|
||||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
|
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
|
||||||
|
import { getScrapeQueue } from "../../src/services/queue-service";
|
||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
|
@ -115,20 +116,31 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||||
|
|
||||||
if (sitemap !== null) {
|
if (sitemap !== null) {
|
||||||
for (const url of sitemap.map(x => x.url)) {
|
const jobs = sitemap.map(x => {
|
||||||
await lockURL(id, sc, url);
|
const url = x.url;
|
||||||
const job = await addScrapeJob({
|
const uuid = uuidv4();
|
||||||
url,
|
return {
|
||||||
mode: "single_urls",
|
name: uuid,
|
||||||
crawlerOptions: crawlerOptions,
|
data: {
|
||||||
team_id: team_id,
|
url,
|
||||||
pageOptions: pageOptions,
|
mode: "single_urls",
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
crawlerOptions: crawlerOptions,
|
||||||
crawl_id: id,
|
team_id: team_id,
|
||||||
sitemapped: true,
|
pageOptions: pageOptions,
|
||||||
});
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
await addCrawlJob(id, job.id);
|
crawl_id: id,
|
||||||
}
|
sitemapped: true,
|
||||||
|
},
|
||||||
|
opts: {
|
||||||
|
jobId: uuid,
|
||||||
|
priority: 2,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
})
|
||||||
|
|
||||||
|
await lockURLs(id, jobs.map(x => x.data.url));
|
||||||
|
await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
|
||||||
|
await getScrapeQueue().addBulk(jobs);
|
||||||
} else {
|
} else {
|
||||||
await lockURL(id, sc, url);
|
await lockURL(id, sc, url);
|
||||||
const job = await addScrapeJob({
|
const job = await addScrapeJob({
|
||||||
|
|
|
@ -30,6 +30,11 @@ export async function addCrawlJob(id: string, job_id: string) {
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Registers a batch of job ids as belonging to a crawl.
 *
 * The ids are added to the Redis set `crawl:<id>:jobs` in a single SADD call,
 * and the set is then given a 24 hour expiry using the `NX` flag (so an
 * expiry that is already present on the key is left untouched).
 *
 * @param id - Identifier of the crawl the jobs belong to.
 * @param job_ids - Job ids to record. An empty array is a no-op.
 */
export async function addCrawlJobs(id: string, job_ids: string[]): Promise<void> {
  // SADD requires at least one member: spreading an empty array would send
  // `SADD key` with no members, which Redis rejects with a
  // "wrong number of arguments" error. Nothing to record, so bail out early.
  if (job_ids.length === 0) {
    return;
  }

  const key = "crawl:" + id + ":jobs";
  await redisConnection.sadd(key, ...job_ids);
  await redisConnection.expire(key, 24 * 60 * 60, "NX");
}
|
||||||
|
|
||||||
export async function addCrawlJobDone(id: string, job_id: string) {
|
export async function addCrawlJobDone(id: string, job_id: string) {
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
||||||
|
@ -54,6 +59,13 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Marks a batch of URLs as visited for a crawl.
 *
 * The URLs are added to the Redis set `crawl:<id>:visited` in a single SADD
 * call, and the set is then given a 24 hour expiry using the `NX` flag (so an
 * expiry that is already present on the key is left untouched).
 *
 * NOTE: does not check limit. Only use if the limit is checked beforehand,
 * e.g. with a sitemap.
 *
 * @param id - Identifier of the crawl.
 * @param urls - URLs to mark as visited. An empty array is a no-op.
 * @returns `true` when SADD reports that at least one URL was newly added to
 *   the set; `false` when none were added or when `urls` is empty.
 */
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
  // SADD requires at least one member: spreading an empty array would send
  // `SADD key` with no members, which Redis rejects with a
  // "wrong number of arguments" error. Nothing was locked, so report false.
  if (urls.length === 0) {
    return false;
  }

  const key = "crawl:" + id + ":visited";
  const addedCount = await redisConnection.sadd(key, ...urls);
  await redisConnection.expire(key, 24 * 60 * 60, "NX");
  return addedCount !== 0;
}
|
||||||
|
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
|
|
Loading…
Reference in New Issue