Merge pull request #55 from mendableai/feat/blocklist-social-media

[Feat] Added blocklist for social media URLs
Nicolas 2024-04-23 16:52:17 -07:00 committed by GitHub
commit 6a1c7d48ae
6 changed files with 101 additions and 0 deletions
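
The three controllers (scrape, crawl, and crawlWebsitePreview) now share the same guard: blocklisted URLs are rejected with a 403 before any scraping job is queued. The diffs below inline the check in each controller; the helper here is only an illustrative TypeScript sketch of the shared pattern, not code from this commit.

import { Request, Response } from "express";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";

// Illustrative only: the guard each controller applies to req.body.url
// before queueing any work. Returns true if the request was rejected.
function rejectIfBlocked(req: Request, res: Response): boolean {
  const url = req.body.url;
  if (url && isUrlBlocked(url)) {
    res.status(403).json({
      error:
        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
    });
    return true;
  }
  return false;
}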


@@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     expect(response.statusCode).not.toBe(401);
   });
 
+  it("should return an error for a blocklisted URL without requiring authorization", async () => {
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/scrape")
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response", async () => {
     const response = await request(TEST_URL)
       .post("/v0/scrape")
@@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     expect(response.statusCode).not.toBe(401);
   });
 
+  it("should return an error for a blocklisted URL", async () => {
+    const blocklistedUrl = "https://twitter.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response", async () => {
     const response = await request(TEST_URL)
       .post("/v0/crawl")
@@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     expect(response.statusCode).not.toBe(401);
   });
 
+  it("should return an error for a blocklisted URL", async () => {
+    const blocklistedUrl = "https://instagram.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/crawlWebsitePreview")
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response", async () => {
     const response = await request(TEST_URL)
       .post("/v0/crawlWebsitePreview")


@@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002";
       .send({ url: "https://firecrawl.dev" });
     expect(response.statusCode).toBe(401);
   });
+
+  it("should return an error for a blocklisted URL", async () => {
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/scrape")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response with a valid preview token", async () => {
     const response = await request(TEST_URL)
       .post("/v0/scrape")
@@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002";
     expect(response.statusCode).toBe(401);
   });
 
+  it("should return an error for a blocklisted URL", async () => {
+    const blocklistedUrl = "https://twitter.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/crawl")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response with a valid API key", async () => {
     const response = await request(TEST_URL)
       .post("/v0/crawl")
@@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002";
     );
   });
 
+
   // Additional tests for insufficient credits?
 
 });
@@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002";
     expect(response.statusCode).toBe(401);
   });
 
+  it("should return an error for a blocklisted URL", async () => {
+    const blocklistedUrl = "https://instagram.com/fake-test";
+    const response = await request(TEST_URL)
+      .post("/v0/crawlWebsitePreview")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send({ url: blocklistedUrl });
+    expect(response.statusCode).toBe(403);
+    expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+  });
+
   it("should return a successful response with a valid API key", async () => {
     const response = await request(TEST_URL)
       .post("/v0/crawlWebsitePreview")


@@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
 import { addWebScraperJob } from "../../src/services/queue-jobs";
+import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) {
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+
+    if (isUrlBlocked(url)) {
+      return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
+    }
+
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };


@@ -2,6 +2,7 @@ import { Request, Response } from "express";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
 import { addWebScraperJob } from "../../src/services/queue-jobs";
+import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+
+    if (isUrlBlocked(url)) {
+      return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
+    }
+
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };


@@ -5,6 +5,7 @@ import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../types";
 import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
+import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 
 export async function scrapeHelper(
   req: Request,
@@ -22,6 +23,10 @@ export async function scrapeHelper(
     return { success: false, error: "Url is required", returnCode: 400 };
   }
 
+  if (isUrlBlocked(url)) {
+    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
+  }
+
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",


@@ -0,0 +1,19 @@
+const socialMediaBlocklist = [
+  'facebook.com',
+  'twitter.com',
+  'instagram.com',
+  'linkedin.com',
+  'pinterest.com',
+  'snapchat.com',
+  'tiktok.com',
+  'reddit.com',
+  'tumblr.com',
+  'flickr.com',
+  'whatsapp.com',
+  'wechat.com',
+  'telegram.org',
+];
+
+export function isUrlBlocked(url: string): boolean {
+  return socialMediaBlocklist.some(domain => url.includes(domain));
+}
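
isUrlBlocked is a plain substring check, so subdomains and deep links on a blocked domain are caught as well as the bare domain. A minimal standalone usage sketch (the example URLs and the relative import path are illustrative, not part of this commit):

import { isUrlBlocked } from "./blocklist";

// Any URL containing a blocklisted domain as a substring is rejected.
console.log(isUrlBlocked("https://twitter.com/fake-test"));       // true
console.log(isUrlBlocked("https://www.facebook.com/some/page"));  // true
console.log(isUrlBlocked("https://firecrawl.dev"));               // false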