fix(WebCrawler): filter out file URLs when taking URLs from sitemap

This commit is contained in:
Gergo Moricz 2024-07-18 21:49:37 +02:00
parent 95c6c63b85
commit f0e95ce399
2 changed files with 3 additions and 2 deletions

View File

@@ -383,7 +383,7 @@ export class WebCrawler {
return linkDomain === baseDomain;
}
-private isFile(url: string): boolean {
+public isFile(url: string): boolean {
const fileExtensions = [
".png",
".jpg",

View File

@@ -2,6 +2,7 @@ import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
+import { WebCrawler } from "./crawler";
export async function getLinksFromSitemap(
{
@@ -41,7 +42,7 @@ export async function getLinksFromSitemap(
}
} else if (root && root.url) {
for (const url of root.url) {
-if (url.loc && url.loc.length > 0) {
+if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
allUrls.push(url.loc[0]);
}
}