diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
new file mode 100644
index 0000000..0a9931d
--- /dev/null
+++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
@@ -0,0 +1,85 @@
+import { url } from "../types";
+
+// Message the schema reports for URLs rejected by the blocklist refinement.
+const BLOCKED_MESSAGE =
+  "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
+
+describe("URL Schema Validation", () => {
+  it("should prepend http:// to URLs without a protocol", () => {
+    expect(url.parse("example.com")).toBe("http://example.com");
+  });
+
+  it("should allow valid URLs with http or https", () => {
+    expect(() => url.parse("http://example.com")).not.toThrow();
+    expect(() => url.parse("https://example.com")).not.toThrow();
+  });
+
+  it("should allow valid URLs without a protocol", () => {
+    expect(() => url.parse("example.com")).not.toThrow();
+  });
+
+  it("should reject non-string input instead of coercing it to a URL", () => {
+    const result = url.safeParse(undefined);
+    expect(result.success).toBe(false);
+    expect(result.success ? undefined : result.error.issues[0]?.code).toBe(
+      "invalid_type"
+    );
+  });
+
+  it("should reject URLs with unsupported protocols", () => {
+    expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
+  });
+
+  it("should reject URLs without a valid top-level domain", () => {
+    expect(() => url.parse("http://example")).toThrow(
+      "URL must have a valid top-level domain or be a valid path"
+    );
+  });
+
+  it("should reject blocked URLs", () => {
+    expect(() => url.parse("https://facebook.com")).toThrow(BLOCKED_MESSAGE);
+  });
+
+  it("should handle URLs with subdomains correctly", () => {
+    expect(() => url.parse("http://sub.example.com")).not.toThrow();
+    expect(() => url.parse("https://blog.example.com")).not.toThrow();
+  });
+
+  it("should handle URLs with paths correctly", () => {
+    expect(() => url.parse("http://example.com/path")).not.toThrow();
+    expect(() => url.parse("https://example.com/another/path")).not.toThrow();
+  });
+
+  it("should reject blocked URLs on subdomains", () => {
+    expect(() => url.parse("https://sub.facebook.com")).toThrow(BLOCKED_MESSAGE);
+  });
+
+  it("should reject blocked URLs with paths", () => {
+    expect(() => url.parse("http://facebook.com/path")).toThrow(BLOCKED_MESSAGE);
+    expect(() => url.parse("https://facebook.com/another/path")).toThrow(
+      BLOCKED_MESSAGE
+    );
+  });
+
+  it("should reject malformed URLs starting with 'http://http'", () => {
+    expect(() => url.parse("http://http://example.com")).toThrow(
+      "Invalid URL. Invalid protocol."
+    );
+  });
+
+  it("should report malformed URLs as a validation failure, not a raw throw", () => {
+    // checkUrl throws; the schema must surface that as a Zod issue.
+    expect(() => url.safeParse("http://http://example.com")).not.toThrow();
+    expect(url.safeParse("http://http://example.com").success).toBe(false);
+  });
+
+  it("should allow URLs that embed another URL in their path", () => {
+    expect(() =>
+      url.parse("http://example.com/http://example.com")
+    ).not.toThrow();
+  });
+
+  it("should reject URLs containing whitespace in the host", () => {
+    expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
+  });
+});
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 8f92d46..bcfca1f 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -2,6 +2,7 @@ import { Request, Response } from "express";
 import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { PageOptions } from "../../lib/entities";
+import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 
 export type Format =
   | "markdown"
@@ -11,17 +12,18 @@ export type Format =
   | "screenshot"
   | "screenshot@fullPage";
 
-const url = z.preprocess(
+/**
+ * Zod schema for user-supplied URLs: defaults a missing protocol to http://,
+ * then requires a well-formed http(s) URL that is not on the blocklist.
+ */
+export const url = z.preprocess(
   (x) => {
-    if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
-      if (x.startsWith("://")) {
-        return "http" + x;
-      } else {
-        return "http://" + x;
-      }
-    } else {
-      return x;
+    // Only strings get a default protocol; anything else is passed through
+    // untouched so z.string() reports a type error instead of "http://undefined".
+    if (typeof x === "string" && !protocolIncluded(x)) {
+      return `http://${x}`;
     }
+    return x;
   },
   z
     .string()
@@ -32,7 +34,20 @@ const url = z.preprocess(
       "URL must have a valid top-level domain or be a valid path"
     )
+    .superRefine((x, ctx) => {
+      // checkUrl signals failure by throwing. Zod does not catch exceptions
+      // raised inside refinements, so convert them into issues here; otherwise
+      // safeParse() would throw a plain Error instead of reporting a failure.
+      try {
+        checkUrl(x);
+      } catch (e) {
+        ctx.addIssue({
+          code: z.ZodIssueCode.custom,
+          message: e instanceof Error ? e.message : "Invalid URL",
+        });
+      }
+    })
     .refine(
       (x) => !isUrlBlocked(x),
       "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
     )
 );
diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts
index bb83a8f..14a74de 100644
--- a/apps/api/src/lib/validateUrl.ts
+++ b/apps/api/src/lib/validateUrl.ts
@@ -1,4 +1,4 @@
-const protocolIncluded = (url: string) => {
+export const protocolIncluded = (url: string) => {
   // if :// not in the start of the url assume http (maybe https?)
   // regex checks if :// appears before any .
   return /^([^.:]+:\/\/)/.test(url);
@@ -35,6 +35,34 @@ export const checkAndUpdateURL = (url: string) => {
   return { urlObj: typedUrlObj, url: url };
 };
 
+/**
+ * Validates a URL string and returns it unchanged when it is acceptable.
+ *
+ * @param url - URL string to validate.
+ * @returns The same `url` string that was passed in.
+ * @throws Error "Invalid URL" when getURLobj reports an error or the protocol
+ *   is not http/https; "Invalid URL. Invalid protocol." when the text before
+ *   the first "." does not contain exactly one ":".
+ */
+export const checkUrl = (url: string) => {
+  const { error, urlObj } = getURLobj(url);
+  if (error) {
+    throw new Error("Invalid URL");
+  }
+
+  const typedUrlObj = urlObj as URL;
+
+  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
+    throw new Error("Invalid URL");
+  }
+
+  if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
+    throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
+  }
+
+  return url;
+};
+
 /**
  * Same domain check
  * It checks if the domain of the url is the same as the base url