From 93a20442e31ecee405e495a8544ae80e2cb4d2c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 19 Sep 2024 22:22:57 +0200 Subject: [PATCH] feat(sdk/rust): first batch of changes for 1.0.0 --- .vscode/settings.json | 5 + apps/rust-sdk/Cargo.lock | 229 ++++++++++++++++++- apps/rust-sdk/Cargo.toml | 11 +- apps/rust-sdk/src/crawl.rs | 297 +++++++++++++++++++++++++ apps/rust-sdk/src/document.rs | 86 +++++++ apps/rust-sdk/src/error.rs | 29 +++ apps/rust-sdk/src/lib.rs | 321 ++------------------------- apps/rust-sdk/src/scrape.rs | 139 ++++++++++++ apps/rust-sdk/tests/e2e_with_auth.rs | 2 +- 9 files changed, 808 insertions(+), 311 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 apps/rust-sdk/src/crawl.rs create mode 100644 apps/rust-sdk/src/document.rs create mode 100644 apps/rust-sdk/src/error.rs create mode 100644 apps/rust-sdk/src/scrape.rs diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9d2a5d8 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "rust-analyzer.linkedProjects": [ + "apps/rust-sdk/Cargo.toml" + ] +} \ No newline at end of file diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index c2b71d6..bf12821 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -26,6 +26,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "arrayref" version = "0.3.7" @@ -151,6 +166,19 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "serde", + "windows-targets 0.52.6", +] + [[package]] name = "clippy" version = "0.0.302" @@ -197,6 +225,51 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "diff" version = "0.1.13" @@ -215,10 +288,10 @@ dependencies = [ ] [[package]] -name = "dotenv" -version = "0.15.0" +name = "dotenvy" +version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" [[package]] name = "encoding_rs" @@ -276,16 +349,17 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "firecrawl" -version = "0.1.0" +version = "1.0.0" dependencies = [ "assert_matches", "clippy", - "dotenv", + "dotenvy", "log 0.4.22", "reqwest", "rustfmt", "serde", "serde_json", + "serde_with", "thiserror", "tokio", "uuid", @@ -426,13 +500,19 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap", + "indexmap 2.2.6", "slab", "tokio", "tokio-util", "tracing", ] +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" @@ -445,6 +525,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "1.1.0" @@ -558,6 +644,35 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.5.0" @@ -568,6 +683,17 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg 1.3.0", + "hashbrown 0.12.3", + "serde", +] + [[package]] name = "indexmap" version = "2.2.6" @@ -575,7 +701,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.14.5", + "serde", ] [[package]] @@ -701,6 +828,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-traits" version = "0.2.19" @@ -846,6 
+979,12 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "proc-macro2" version = "1.0.86" @@ -1293,6 +1432,36 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.2.6", + "serde", + "serde_derive", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -1342,6 +1511,12 @@ dependencies = [ "log 0.3.9", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" @@ -1489,6 +1664,37 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "time" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "tinyvec" version = "1.7.0" @@ -1843,6 +2049,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 685545e..c28aff5 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,13 +1,13 @@ [package] name = "firecrawl" -author="Mendable.ai" -version = "0.1.0" +author= "Mendable.ai" +version = "1.0.0" edition = "2021" -license = "GPL-2.0-or-later" +license = "GPL-3.0-or-later" homepage = "https://www.firecrawl.dev/" repository ="https://github.com/mendableai/firecrawl" description = "Rust SDK for Firecrawl API." 
-authors = ["sanix-darker "] +authors = ["Gergő Móricz ", "sanix-darker "] [lib] path = "src/lib.rs" @@ -18,6 +18,7 @@ name = "firecrawl" reqwest = { version = "^0.12", features = ["json", "blocking"] } serde = { version = "^1.0", features = ["derive"] } serde_json = "^1.0" +serde_with = "^3.9" log = "^0.4" thiserror = "^1.0" uuid = { version = "^1.10", features = ["v4"] } @@ -27,7 +28,7 @@ tokio = { version = "^1", features = ["full"] } clippy = "^0.0.302" rustfmt = "^0.10" assert_matches = "^1.5" -dotenv = "^0.15" +dotenvy = "^0.15" tokio = { version = "1", features = ["full"] } [build-dependencies] diff --git a/apps/rust-sdk/src/crawl.rs b/apps/rust-sdk/src/crawl.rs new file mode 100644 index 0000000..cf211bf --- /dev/null +++ b/apps/rust-sdk/src/crawl.rs @@ -0,0 +1,297 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION}; + +#[derive(Deserialize, Serialize, Clone, Copy, Debug)] +pub enum CrawlScrapeFormats { + /// Will result in a copy of the Markdown content of the page. + #[serde(rename = "markdown")] + Markdown, + + /// Will result in a copy of the filtered, content-only HTML of the page. + #[serde(rename = "html")] + HTML, + + /// Will result in a copy of the raw HTML of the page. + #[serde(rename = "rawHtml")] + RawHTML, + + /// Will result in a Vec of URLs found on the page. + #[serde(rename = "links")] + Links, + + /// Will result in a URL to a screenshot of the page. + /// + /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`. + #[serde(rename = "screenshot")] + Screenshot, + + /// Will result in a URL to a full-page screenshot of the page. + /// + /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`. + #[serde(rename = "screenshot@fullPage")] + ScreenshotFullPage, +} + +impl From for ScrapeFormats { + fn from(value: CrawlScrapeFormats) -> Self { + match value { + CrawlScrapeFormats::Markdown => Self::Markdown, + CrawlScrapeFormats::HTML => Self::HTML, + CrawlScrapeFormats::RawHTML => Self::RawHTML, + CrawlScrapeFormats::Links => Self::Links, + CrawlScrapeFormats::Screenshot => Self::Screenshot, + CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage, + } + } +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct CrawlScrapeOptions { + /// Formats to extract from the page. (default: `[ Markdown ]`) + pub formats: Option>, + + /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`) + pub only_main_content: Option, + + /// HTML tags to exclusively include. + /// + /// For example, if you pass `div`, you will only get content from `
`s and their children. + pub include_tags: Option>, + + /// HTML tags to exclude. + /// + /// For example, if you pass `img`, you will never get image URLs in your results. + pub exclude_tags: Option>, + + /// Additional HTTP headers to use when loading the page. + pub headers: Option>, + + // Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`) + pub wait_for: Option, + + // Timeout before returning an error, in milliseconds. (default: `60000`) + pub timeout: Option, +} + +impl From for ScrapeOptions { + fn from(value: CrawlScrapeOptions) -> Self { + ScrapeOptions { + formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()), + only_main_content: value.only_main_content, + include_tags: value.include_tags, + exclude_tags: value.exclude_tags, + headers: value.headers, + wait_for: value.wait_for, + timeout: value.timeout, + ..Default::default() + } + } +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct CrawlOptions { + /// Options to pass through to the scraper. + pub scrape_options: Option, + + /// URL RegEx patterns to (exclusively) include. + /// + /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled. + pub include_paths: Option, + + /// URL RegEx patterns to exclude. + /// + /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled. + pub exclude_paths: Option, + + /// Maximum URL depth to crawl, relative to the base URL. (default: `2`) + pub max_depth: Option, + + /// Tells the crawler to ignore the sitemap when crawling. (default: `true`) + pub ignore_sitemap: Option, + + /// Maximum number of pages to crawl. (default: `10`) + pub limit: Option, + + /// Allows the crawler to navigate links that are backwards in the URL hierarchy. (default: `false`) + pub allow_backward_links: Option, + + /// Allows the crawler to follow links to external URLs. (default: `false`) + pub allow_external_links: Option, + + /// URL to send Webhook crawl events to. + pub webhook: Option, + + /// Idempotency key to send to the crawl endpoint. + #[serde(skip)] + pub idempotency_key: Option, + + /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`) + #[serde(skip)] + pub poll_interval: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +struct CrawlRequestBody { + url: String, + + #[serde(flatten)] + options: CrawlOptions, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +struct CrawlResponse { + /// This will always be `true` due to `FirecrawlApp::handle_response`. + /// No need to expose. + success: bool, + + /// The resulting document. + data: Document, +} + +#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)] +#[serde(rename_all = "camelCase")] +pub enum CrawlStatusTypes { + /// The crawl job is in progress. + Scraping, + + /// The crawl job has been completed successfully. + Completed, + + /// The crawl job has failed. + Failed, + + /// The crawl job has been cancelled. 
+ Cancelled, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct CrawlStatus { + /// The status of the crawl. + pub status: CrawlStatusTypes, + + /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages. + pub total: u32, + + /// Number of pages that have been successfully scraped. + pub completed: u32, + + /// Amount of credits used by the crawl job. + pub credits_used: u32, + + /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API. + pub expires_at: String, // TODO: parse into date + + /// URL to call to get the next batch of documents. + /// Unless you are sidestepping the SDK, you do not need to deal with this. + pub next: Option, + + /// List of documents returned by the crawl + pub data: Vec, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct CrawlAsyncResponse { + success: bool, + + /// Crawl ID + pub id: String, + + /// URL to get the status of the crawl job + pub url: String, +} + +impl FirecrawlApp { + pub async fn crawl_url_async( + &self, + url: impl AsRef, + options: Option, + ) -> Result { + let body = CrawlRequestBody { + url: url.as_ref().to_string(), + options: options.unwrap_or_default(), + }; + + let headers = self.prepare_headers(body.options.idempotency_key.as_ref()); + + let response = self + .client + .post(&format!("{}{}/crawl", self.api_url, API_VERSION)) + .headers(headers.clone()) + .json(&body) + .send() + .await + .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; + + self.handle_response::(response, "start crawl job").await + } + + pub async fn crawl_url( + &self, + url: impl AsRef, + options: Option, + ) -> Result, FirecrawlError> { + let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000); + + let res = self.crawl_url_async(url, options).await?; + + self.monitor_job_status(&res.id, poll_interval).await + } + + pub async fn check_crawl_status(&self, id: &str) -> Result { + let response = self + .client + .get(&format!( + "{}{}/crawl/{}", + self.api_url, API_VERSION, id + )) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; + + self.handle_response(response, "check crawl status").await + } + + async fn monitor_job_status( + &self, + id: &str, + poll_interval: u64, + ) -> Result, FirecrawlError> { + loop { + let status_data = self.check_crawl_status(id).await?; + match status_data.status { + CrawlStatusTypes::Completed => { + return Ok(status_data.data); + } + CrawlStatusTypes::Scraping => { + tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await; + } + CrawlStatusTypes::Failed => { + return Err(FirecrawlError::CrawlJobFailed(format!( + "Crawl job failed." + ))); + } + CrawlStatusTypes::Cancelled => { + return Err(FirecrawlError::CrawlJobFailed(format!( + "Crawl job was cancelled." 
+ ))); + } + } + } + } +} diff --git a/apps/rust-sdk/src/document.rs b/apps/rust-sdk/src/document.rs new file mode 100644 index 0000000..5eba5df --- /dev/null +++ b/apps/rust-sdk/src/document.rs @@ -0,0 +1,86 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct DocumentMetadata { + // firecrawl specific + #[serde(rename = "sourceURL")] + pub source_url: String, + pub status_code: u16, + pub error: Option, + + // basic meta tags + pub title: String, + pub description: String, + pub language: Option, + pub keywords: Option, + pub robots: Option, + + // og: namespace + pub og_title: Option, + pub og_description: Option, + pub og_url: Option, + pub og_image: Option, + pub og_audio: Option, + pub og_determiner: Option, + pub og_locale: Option, + pub og_locale_alternate: Option, + pub og_site_name: Option, + pub og_video: Option, + + // article: namespace + pub article_section: Option, + pub article_tag: Option, + pub published_time: Option, + pub modified_time: Option, + + // dc./dcterms. namespace + pub dcterms_keywords: Option, + pub dc_description: Option, + pub dc_subject: Option, + pub dcterms_subject: Option, + pub dcterms_audience: Option, + pub dc_type: Option, + pub dcterms_type: Option, + pub dc_date: Option, + pub dc_date_created: Option, + pub dcterms_created: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct Document { + /// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default) + pub markdown: Option, + + /// The HTML of the page, present if `ScrapeFormats::HTML` is present in `ScrapeOptions.formats`. + /// + /// This contains HTML that has non-content tags removed. If you need the original HTML, use `ScrapeFormats::RawHTML`. + pub html: Option, + + /// The raw HTML of the page, present if `ScrapeFormats::RawHTML` is present in `ScrapeOptions.formats`. + /// + /// This contains the original, untouched HTML on the page. If you only need human-readable content, use `ScrapeFormats::HTML`. + pub raw_html: Option, + + /// The URL to the screenshot of the page, present if `ScrapeFormats::Screenshot` or `ScrapeFormats::ScreenshotFullPage` is present in `ScrapeOptions.formats`. + pub screenshot: Option, + + /// A list of the links on the page, present if `ScrapeFormats::Links` is present in `ScrapeOptions.formats`. + pub links: Option>, + + /// The extracted data from the page, present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`. + /// If `ScrapeOptions.extract.schema` is `Some`, this `Value` is guaranteed to match the provided schema. + pub extract: Option, + + /// The metadata from the page. + pub metadata: DocumentMetadata, + + /// Can be present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`. + /// The warning message will contain any errors encountered during the extraction. + pub warning: Option, +} + diff --git a/apps/rust-sdk/src/error.rs b/apps/rust-sdk/src/error.rs new file mode 100644 index 0000000..a6d11eb --- /dev/null +++ b/apps/rust-sdk/src/error.rs @@ -0,0 +1,29 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use thiserror::Error; + +#[derive(Debug, Deserialize, Serialize, Clone)] +pub struct FirecrawlAPIError { + /// Always false. 
+ success: bool, + + /// Error message + pub error: String, + + /// Additional details of this error. Schema depends on the error itself. + pub details: Option, +} + +#[derive(Error, Debug)] +pub enum FirecrawlError { + #[error("HTTP request failed: {0}")] + HttpRequestFailed(String), + #[error("API key not provided")] + APIKeyNotProvided, + #[error("Failed to parse response: {0}")] + ResponseParseError(String), + #[error("API error")] + APIError(FirecrawlAPIError), + #[error("Crawl job failed or stopped: {0}")] + CrawlJobFailed(String), +} diff --git a/apps/rust-sdk/src/lib.rs b/apps/rust-sdk/src/lib.rs index a2ca75a..6c519a2 100644 --- a/apps/rust-sdk/src/lib.rs +++ b/apps/rust-sdk/src/lib.rs @@ -1,40 +1,14 @@ -/* -* -* - Structs and Enums: -* FirecrawlError: Custom error enum for handling various errors. -* FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client. -* -* - Initialization: -* -* FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided. -* -* - API Methods: -* scrape_url, search, crawl_url, check_crawl_status: -* Methods for interacting with the Firecrawl API, similar to the Python methods. -* monitor_job_status: Polls the API to monitor the status of a crawl job until completion. -*/ - -use std::env; -use std::thread; -use std::time::Duration; - -use log::debug; use reqwest::{Client, Response}; +use serde::de::DeserializeOwned; use serde_json::json; use serde_json::Value; -use thiserror::Error; -#[derive(Error, Debug)] -pub enum FirecrawlError { - #[error("HTTP request failed: {0}")] - HttpRequestFailed(String), - #[error("API key not provided")] - ApiKeyNotProvided, - #[error("Failed to parse response: {0}")] - ResponseParseError(String), - #[error("Crawl job failed or stopped: {0}")] - CrawlJobFailed(String), -} +pub mod crawl; +pub mod document; +mod error; +pub mod scrape; + +pub use error::FirecrawlError; #[derive(Clone, Debug)] pub struct FirecrawlApp { @@ -42,26 +16,15 @@ pub struct FirecrawlApp { api_url: String, client: Client, } -// the api verstion of firecrawl -const API_VERSION: &str = "/v0"; + +pub(crate) const API_VERSION: &str = "/v1"; impl FirecrawlApp { - /// Initialize the FirecrawlApp instance. - /// - /// # Arguments: - /// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API. - /// * `api_url` (Optional[str]): Base URL for the Firecrawl API. pub fn new(api_key: Option, api_url: Option) -> Result { let api_key = api_key - .or_else(|| env::var("FIRECRAWL_API_KEY").ok()) - .ok_or(FirecrawlError::ApiKeyNotProvided)?; - let api_url = api_url.unwrap_or_else(|| { - env::var("FIRECRAWL_API_URL") - .unwrap_or_else(|_| "https://api.firecrawl.dev".to_string()) - }); - - debug!("Initialized FirecrawlApp with API key: {}", api_key); - debug!("Initialized FirecrawlApp with API URL: {}", api_url); + .ok_or(FirecrawlError::APIKeyNotProvided)?; + let api_url = api_url + .unwrap_or_else(|| "https://api.firecrawl.dev".to_string()); Ok(FirecrawlApp { api_key, @@ -70,237 +33,7 @@ impl FirecrawlApp { }) } - /// Scrape the specified URL using the Firecrawl API. - /// - /// # Arguments: - /// * `url` (str): The URL to scrape. - /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request. - /// - /// # Returns: - /// * `Any`: The scraped data if the request is successful. - /// - /// # Raises: - /// * `Exception`: If the scrape request fails. 
- pub async fn scrape_url( - &self, - url: &str, - params: Option, - ) -> Result { - let headers = self.prepare_headers(None); - let mut scrape_params = json!({"url": url}); - - if let Some(mut params) = params { - if let Some(extractor_options) = params.get_mut("extractorOptions") { - if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") { - if extraction_schema.is_object() && extraction_schema.get("schema").is_some() { - extractor_options["extractionSchema"] = extraction_schema["schema"].clone(); - } - extractor_options["mode"] = extractor_options - .get("mode") - .cloned() - .unwrap_or_else(|| json!("llm-extraction")); - } - scrape_params["extractorOptions"] = extractor_options.clone(); - } - for (key, value) in params.as_object().unwrap() { - if key != "extractorOptions" { - scrape_params[key] = value.clone(); - } - } - } - - let response = self - .client - .post(&format!("{}{}/scrape", self.api_url, API_VERSION)) - .headers(headers) - .json(&scrape_params) - .send() - .await - .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; - - self.handle_response(response, "scrape URL").await - } - - /// Perform a search using the Firecrawl API. - /// - /// # Arguments: - /// * `query` (str): The search query. - /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request. - /// - /// # Returns: - /// * `Any`: The search results if the request is successful. - /// - /// # Raises: - /// * `Exception`: If the search request fails. - pub async fn search( - &self, - query: &str, - params: Option, - ) -> Result { - let headers = self.prepare_headers(None); - let mut json_data = json!({"query": query}); - if let Some(params) = params { - for (key, value) in params.as_object().unwrap() { - json_data[key] = value.clone(); - } - } - - let response = self - .client - .post(&format!("{}{}/search", self.api_url, API_VERSION)) - .headers(headers) - .json(&json_data) - .send() - .await - .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; - - self.handle_response(response, "search").await - } - - /// Initiate a crawl job for the specified URL using the Firecrawl API. - /// - /// # Arguments: - /// * `url` (str): The URL to crawl. - /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - /// * `wait_until_done` (bool): Whether to wait until the crawl job is completed. - /// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion. - /// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests. - /// - /// # Returns: - /// * `Any`: The crawl job ID or the crawl results if waiting until completion. - /// - /// # `Raises`: - /// * `Exception`: If the crawl job initiation or monitoring fails. 
- pub async fn crawl_url( - &self, - url: &str, - params: Option, - wait_until_done: bool, - poll_interval: u64, - idempotency_key: Option, - ) -> Result { - let headers = self.prepare_headers(idempotency_key); - let mut json_data = json!({"url": url}); - if let Some(params) = params { - for (key, value) in params.as_object().unwrap() { - json_data[key] = value.clone(); - } - } - - let response = self - .client - .post(&format!("{}{}/crawl", self.api_url, API_VERSION)) - .headers(headers.clone()) - .json(&json_data) - .send() - .await - .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; - - let response_json = self.handle_response(response, "start crawl job").await?; - let job_id = response_json["jobId"].as_str().unwrap().to_string(); - - if wait_until_done { - self.monitor_job_status(&job_id, headers, poll_interval) - .await - } else { - Ok(json!({"jobId": job_id})) - } - } - - /// Check the status of a crawl job using the Firecrawl API. - /// - /// # Arguments: - /// * `job_id` (str): The ID of the crawl job. - /// - /// # Returns: - /// * `Any`: The status of the crawl job. - /// - /// # Raises: - /// * `Exception`: If the status check request fails. - pub async fn check_crawl_status(&self, job_id: &str) -> Result { - let headers = self.prepare_headers(None); - let response = self - .client - .get(&format!( - "{}{}/crawl/status/{}", - self.api_url, API_VERSION, job_id - )) - .headers(headers) - .send() - .await - .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; - - self.handle_response(response, "check crawl status").await - } - - /// Monitor the status of a crawl job until completion. - /// - /// # Arguments: - /// * `job_id` (str): The ID of the crawl job. - /// * `headers` (Dict[str, str]): The headers to include in the status check requests. - /// * `poll_interval` (int): Secounds between status checks. - /// - /// # Returns: - /// * `Any`: The crawl results if the job is completed successfully. - /// - /// # Raises: - /// Exception: If the job fails or an error occurs during status checks. - async fn monitor_job_status( - &self, - job_id: &str, - headers: reqwest::header::HeaderMap, - poll_interval: u64, - ) -> Result { - loop { - let response = self - .client - .get(&format!( - "{}{}/crawl/status/{}", - self.api_url, API_VERSION, job_id - )) - .headers(headers.clone()) - .send() - .await - .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; - - let status_data = self.handle_response(response, "check crawl status").await?; - match status_data["status"].as_str() { - Some("completed") => { - if status_data["data"].is_object() { - return Ok(status_data["data"].clone()); - } else { - return Err(FirecrawlError::CrawlJobFailed( - "Crawl job completed but no data was returned".to_string(), - )); - } - } - Some("active") | Some("paused") | Some("pending") | Some("queued") - | Some("waiting") => { - thread::sleep(Duration::from_secs(poll_interval)); - } - Some(status) => { - return Err(FirecrawlError::CrawlJobFailed(format!( - "Crawl job failed or was stopped. Status: {}", - status - ))); - } - None => { - return Err(FirecrawlError::CrawlJobFailed( - "Unexpected response: no status field".to_string(), - )); - } - } - } - } - - /// Prepare the headers for API requests. - /// - /// # Arguments: - /// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests. - /// - /// # Returns: - /// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key. 
- fn prepare_headers(&self, idempotency_key: Option) -> reqwest::header::HeaderMap { + fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap { let mut headers = reqwest::header::HeaderMap::new(); headers.insert("Content-Type", "application/json".parse().unwrap()); headers.insert( @@ -313,30 +46,22 @@ impl FirecrawlApp { headers } - /// Handle errors from API responses. - /// - /// # Arguments: - /// * `response` (requests.Response): The response object from the API request. - /// * `action` (str): Description of the action that was being performed. - /// - /// # Raises: - /// Exception: An exception with a message containing the status code and error details from the response. - async fn handle_response( + async fn handle_response<'a, T: DeserializeOwned>( &self, response: Response, - action: &str, - ) -> Result { + action: impl AsRef, + ) -> Result { if response.status().is_success() { let response_json: Value = response .json() .await .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?; if response_json["success"].as_bool().unwrap_or(false) { - Ok(response_json["data"].clone()) + Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?) } else { Err(FirecrawlError::HttpRequestFailed(format!( "Failed to {}: {}", - action, response_json["error"] + action.as_ref(), response_json["error"] ))) } } else { @@ -348,23 +73,23 @@ impl FirecrawlApp { let message = match status_code { 402 => format!( "Payment Required: Failed to {}. {}", - action, error_message["error"] + action.as_ref(), error_message["error"] ), 408 => format!( "Request Timeout: Failed to {} as the request timed out. {}", - action, error_message["error"] + action.as_ref(), error_message["error"] ), 409 => format!( "Conflict: Failed to {} due to a conflict. {}", - action, error_message["error"] + action.as_ref(), error_message["error"] ), 500 => format!( "Internal Server Error: Failed to {}. {}", - action, error_message["error"] + action.as_ref(), error_message["error"] ), _ => format!( "Unexpected error during {}: Status code {}. {}", - action, status_code, error_message["error"] + action.as_ref(), status_code, error_message["error"] ), }; Err(FirecrawlError::HttpRequestFailed(message)) diff --git a/apps/rust-sdk/src/scrape.rs b/apps/rust-sdk/src/scrape.rs new file mode 100644 index 0000000..4b48162 --- /dev/null +++ b/apps/rust-sdk/src/scrape.rs @@ -0,0 +1,139 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION}; + +#[derive(Deserialize, Serialize, Clone, Copy, Debug)] +pub enum ScrapeFormats { + /// Will result in a copy of the Markdown content of the page. + #[serde(rename = "markdown")] + Markdown, + + /// Will result in a copy of the filtered, content-only HTML of the page. + #[serde(rename = "html")] + HTML, + + /// Will result in a copy of the raw HTML of the page. + #[serde(rename = "rawHtml")] + RawHTML, + + /// Will result in a Vec of URLs found on the page. + #[serde(rename = "links")] + Links, + + /// Will result in a URL to a screenshot of the page. + /// + /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`. + #[serde(rename = "screenshot")] + Screenshot, + + /// Will result in a URL to a full-page screenshot of the page. + /// + /// Can not be used in conjunction with `ScrapeFormats::Screenshot`. 
+ #[serde(rename = "screenshot@fullPage")] + ScreenshotFullPage, + + /// Will result in the results of an LLM extraction. + /// + /// See `ScrapeOptions.extract` for more options. + #[serde(rename = "extract")] + Extract, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct ExtractOptions { + /// Schema the output should adhere to, provided in JSON Schema format. + pub schema: Option, + + pub system_prompt: Option, + + /// Extraction prompt to send to the LLM agent along with the page content. + pub prompt: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +pub struct ScrapeOptions { + /// Formats to extract from the page. (default: `[ Markdown ]`) + pub formats: Option>, + + /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`) + pub only_main_content: Option, + + /// HTML tags to exclusively include. + /// + /// For example, if you pass `div`, you will only get content from `
`s and their children. + pub include_tags: Option>, + + /// HTML tags to exclude. + /// + /// For example, if you pass `img`, you will never get image URLs in your results. + pub exclude_tags: Option>, + + /// Additional HTTP headers to use when loading the page. + pub headers: Option>, + + // Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`) + pub wait_for: Option, + + // Timeout before returning an error, in milliseconds. (default: `60000`) + pub timeout: Option, + + /// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`. + pub extract: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +struct ScrapeRequestBody { + url: String, + + #[serde(flatten)] + options: ScrapeOptions, +} + +#[derive(Deserialize, Serialize, Debug, Default)] +#[serde_with::skip_serializing_none] +#[serde(rename_all = "camelCase")] +struct ScrapeResponse { + /// This will always be `true` due to `FirecrawlApp::handle_response`. + /// No need to expose. + success: bool, + + /// The resulting document. + data: Document, +} + +impl FirecrawlApp { + pub async fn scrape_url( + &self, + url: impl AsRef, + options: Option, + ) -> Result { + let body = ScrapeRequestBody { + url: url.as_ref().to_string(), + options: options.unwrap_or_default(), + }; + + let headers = self.prepare_headers(None); + + let response = self + .client + .post(&format!("{}{}/scrape", self.api_url, API_VERSION)) + .headers(headers) + .json(&body) + .send() + .await + .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?; + + let response = self.handle_response::(response, "scrape URL").await?; + + Ok(response.data) + } +} diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index ac9dc1d..99b14df 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -1,5 +1,5 @@ use assert_matches::assert_matches; -use dotenv::dotenv; +use dotenvy::dotenv; use firecrawl::FirecrawlApp; use serde_json::json; use std::env;
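
Usage sketch for the reworked 1.0.0 surface. This example is illustrative and not part of the patch itself: it assumes the module layout and struct fields introduced above in lib.rs, scrape.rs and crawl.rs, a tokio runtime provided by the calling crate, a placeholder API key ("fc-YOUR-API-KEY"), and https://firecrawl.dev as an arbitrary example URL.

use firecrawl::crawl::CrawlOptions;
use firecrawl::scrape::{ScrapeFormats, ScrapeOptions};
use firecrawl::{FirecrawlApp, FirecrawlError};

#[tokio::main]
async fn main() -> Result<(), FirecrawlError> {
    // FirecrawlApp::new no longer reads FIRECRAWL_API_KEY from the environment;
    // the key is passed explicitly. None for the URL falls back to https://api.firecrawl.dev.
    let app = FirecrawlApp::new(Some("fc-YOUR-API-KEY".to_string()), None)?;

    // Scrape a single page, requesting Markdown plus the links found on it.
    let doc = app
        .scrape_url(
            "https://firecrawl.dev",
            Some(ScrapeOptions {
                formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::Links]),
                ..Default::default()
            }),
        )
        .await?;
    println!(
        "scraped {}: {} links",
        doc.metadata.source_url,
        doc.links.as_ref().map(Vec::len).unwrap_or(0)
    );

    // Start a crawl capped at 10 pages and wait for it to finish;
    // crawl_url keeps checking the crawl status until the job completes.
    let pages = app
        .crawl_url(
            "https://firecrawl.dev",
            Some(CrawlOptions {
                limit: Some(10),
                ..Default::default()
            }),
        )
        .await?;
    println!("crawled {} documents", pages.len());

    Ok(())
}

Compared to 0.1.x, options travel as typed structs (ScrapeOptions, CrawlOptions) instead of raw serde_json::Value maps, scrape_url returns a typed Document, and crawl_url polls the v1 crawl endpoint until completion, returning every scraped Document.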