From a684bd3c5d765e263dfec15aae113339bede4991 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 23 Jul 2024 09:07:23 -0300
Subject: [PATCH 1/2] added regex for links in sitemap

---
 apps/api/src/scraper/WebScraper/crawler.ts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 59b5364..00d5185 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -64,6 +64,14 @@ export class WebCrawler {
   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
+
+        // if link is not a complete url, add the base url
+        link = link.trim();
+        const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i');
+        if (!isCompleteUrl.test(link)){
+          link = this.baseUrl + link;
+        }
+
         const url = new URL(link);
         const path = url.pathname;
 

From 5e728c1a4d1286fb4d2beb6d302d98f731f8ec24 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 24 Jul 2024 08:33:00 -0300
Subject: [PATCH 2/2] Update apps/api/src/scraper/WebScraper/crawler.ts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

no need for regex

Co-authored-by: Gergő Móricz
---
 apps/api/src/scraper/WebScraper/crawler.ts | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 00d5185..640eada 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -64,15 +64,7 @@ export class WebCrawler {
   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
-
-        // if link is not a complete url, add the base url
-        link = link.trim();
-        const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i');
-        if (!isCompleteUrl.test(link)){
-          link = this.baseUrl + link;
-        }
-
-        const url = new URL(link);
+        const url = new URL(link.trim(), this.baseUrl);
         const path = url.pathname;
 
         const depth = getURLDepth(url.toString());