diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 354a5cb..e112cd4 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
 import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { handleCustomScraping } from "./custom/handleCustomScraping";
+import { removeUnwantedElements } from "./utils/removeUnwantedElements";
 import axios from "axios";
 
 dotenv.config();
@@ -313,44 +313,6 @@ export async function scrapSingleUrl(
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
-  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
-    const soup = cheerio.load(html);
-    soup("script, style, iframe, noscript, meta, head").remove();
-
-    if (pageOptions.removeTags) {
-      if (typeof pageOptions.removeTags === 'string') {
-        pageOptions.removeTags = [pageOptions.removeTags];
-      }
-
-      if (Array.isArray(pageOptions.removeTags)) {
-        pageOptions.removeTags.forEach((tag) => {
-          let elementsToRemove;
-          if (tag.startsWith("*") && tag.endsWith("*")) {
-            const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
-            elementsToRemove = soup('*').filter((index, element) => {
-              const classNames = soup(element).attr('class');
-              return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
-            });
-          } else {
-            elementsToRemove = soup(tag);
-          }
-
-          elementsToRemove.remove();
-        });
-      }
-    }
-
-    if (pageOptions.onlyMainContent) {
-      // remove any other tags that are not in the main content
-      excludeNonMainTags.forEach((tag) => {
-        const elementsToRemove = soup(tag);
-        elementsToRemove.remove();
-      });
-    }
-    const cleanedHtml = soup.html();
-    return cleanedHtml;
-};
-
   const attemptScraping = async (
     url: string,
     method: (typeof baseScrapers)[number]
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
new file mode 100644
index 0000000..cfa49e7
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
@@ -0,0 +1,63 @@
+import { removeUnwantedElements } from "../removeUnwantedElements";
+import { PageOptions } from "../../../../lib/entities";
+
+describe('removeUnwantedElements', () => {
+  it('should remove script, style, iframe, noscript, meta, and head tags', () => {
+    const html = `<html><head><meta charset="utf-8"><title>Test</title><style></style></head><body><script src="script.js"></script><noscript></noscript><iframe src="frame.html"></iframe><div>Content</div></body></html>`;
+    const options: PageOptions = {};
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('<script>');
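
The extracted helper module, `apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts`, does not appear in the hunks above. Below is a minimal sketch of what it presumably exports, assuming the inline function removed from `single_url.ts` was moved over essentially verbatim; the import paths are inferred from the repo layout, not confirmed by this diff:

```typescript
// utils/removeUnwantedElements.ts — a sketch, assuming a near-verbatim
// extraction of the inline helper removed from single_url.ts above.
import * as cheerio from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";

export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions
): string => {
  const soup = cheerio.load(html);

  // Tags that never carry scrapeable content are always stripped.
  soup("script, style, iframe, noscript, meta, head").remove();

  if (pageOptions.removeTags) {
    // Accept a single selector as well as an array of selectors.
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }

    if (Array.isArray(pageOptions.removeTags)) {
      pageOptions.removeTags.forEach((tag) => {
        let elementsToRemove;
        if (tag.startsWith("*") && tag.endsWith("*")) {
          // "*foo*" removes any element whose class list contains "foo".
          const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
          elementsToRemove = soup("*").filter((index, element) => {
            const classNames = soup(element).attr("class");
            return Boolean(
              classNames &&
                classNames
                  .split(/\s+/)
                  .some((className) => regexPattern.test(className))
            );
          });
        } else {
          // Otherwise the entry is treated as a plain CSS selector.
          elementsToRemove = soup(tag);
        }

        elementsToRemove.remove();
      });
    }
  }

  if (pageOptions.onlyMainContent) {
    // Strip boilerplate selectors (navs, footers, ads, ...) that are
    // not part of the main content.
    excludeNonMainTags.forEach((tag) => {
      soup(tag).remove();
    });
  }

  return soup.html();
};
```

Pulling the helper out of `scrapSingleUrl` is what makes the new test file possible: the function can now be exercised directly against small HTML fixtures instead of only through a full scrape.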