Fixed Google Drive PDF matching in custom scraping

It now works for both the `https://getgc.ai/privacy` and `https://prairie.cards/products/wood-designs` use cases.
2024-07-02 11:25:17 -03:00 · 2024-07-02 11:25:17 -03:00 · 0175152577
parent 96de948d6b
commit 0175152577
1 changed file with 15 additions and 14 deletions
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@@ -29,22 +29,23 @@ export async function handleCustomScraping(
    };
  }

-  // Check for Google Drive PDF links in the raw HTML
-  const googleDrivePdfPattern =
-    /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
-  const googleDrivePdfLink = url.match(googleDrivePdfPattern);
-  if (googleDrivePdfLink) {
-    console.log(
-      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
-    );
+  // Check for Google Drive PDF links in meta tags
+  const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
+  const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
+  if (googleDriveMetaMatch) {
+    const url = googleDriveMetaMatch[1];
+    console.log(`Google Drive PDF link detected: ${url}`);

-    const fileId = googleDrivePdfLink[1];
-    const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
+    const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
+    if (fileIdMatch) {
+      const fileId = fileIdMatch[1];
+      const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

-    return {
-      scraper: "pdf",
-      url: pdfUrl
-    };
+      return {
+        scraper: "pdf",
+        url: pdfUrl
+      };
+    }
  }
  
  return null;