From 01751525770cf646751c34cba9e560495877e775 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 2 Jul 2024 11:25:17 -0300
Subject: [PATCH] Fixed PDF match custom scraping

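Detect the Google Drive PDF link from the `<meta itemprop="url">` tag in the page HTML instead of matching it against the request URL. This now works for both the `https://getgc.ai/privacy` and `https://prairie.cards/products/wood-designs` use cases.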
---
 .../WebScraper/custom/handleCustomScraping.ts | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
index d78d815..f8b2503 100644
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -29,22 +29,23 @@ export async function handleCustomScraping(
     };
   }
 
-  // Check for Google Drive PDF links in the raw HTML
-  const googleDrivePdfPattern =
-    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
-  const googleDrivePdfLink = url.match(googleDrivePdfPattern);
-  if (googleDrivePdfLink) {
-    console.log(
-      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
-    );
+  // Check for Google Drive PDF links in meta tags
+  const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
+  const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
+  if (googleDriveMetaMatch) {
+    const url = googleDriveMetaMatch[1];
+    console.log(`Google Drive PDF link detected: ${url}`);
 
-    const fileId = googleDrivePdfLink[1];
-    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
+    const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
+    if (fileIdMatch) {
+      const fileId = fileIdMatch[1];
+      const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
 
-    return {
-      scraper: "pdf",
-      url: pdfUrl
-    };
+      return {
+        scraper: "pdf",
+        url: pdfUrl
+      };
+    }
   }
   
   return null;