From 01751525770cf646751c34cba9e560495877e775 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:25:17 -0300 Subject: [PATCH] Fixed PDF match custom scraping Now it's working for both `https://getgc.ai/privacy` and `https://prairie.cards/products/wood-designs` use cases. --- .../WebScraper/custom/handleCustomScraping.ts | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index d78d815..f8b2503 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -29,22 +29,23 @@ export async function handleCustomScraping( }; } - // Check for Google Drive PDF links in the raw HTML - const googleDrivePdfPattern = - /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/; - const googleDrivePdfLink = url.match(googleDrivePdfPattern); - if (googleDrivePdfLink) { - console.log( - `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}` - ); + // Check for Google Drive PDF links in meta tags + const googleDriveMetaPattern = /