feat(sdk/rust): first batch of changes for 1.0.0

This commit is contained in:
Gergő Móricz 2024-09-19 22:22:57 +02:00
parent 6aa468163e
commit 93a20442e3
9 changed files with 808 additions and 311 deletions

5
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,5 @@
{
"rust-analyzer.linkedProjects": [
"apps/rust-sdk/Cargo.toml"
]
}

229
apps/rust-sdk/Cargo.lock generated
View File

@ -26,6 +26,21 @@ dependencies = [
"memchr",
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "arrayref"
version = "0.3.7"
@ -151,6 +166,19 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
"num-traits",
"serde",
"windows-targets 0.52.6",
]
[[package]]
name = "clippy"
version = "0.0.302"
@ -197,6 +225,51 @@ version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "darling"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
"serde",
]
[[package]]
name = "diff"
version = "0.1.13"
@ -215,10 +288,10 @@ dependencies = [
]
[[package]]
name = "dotenv"
version = "0.15.0"
name = "dotenvy"
version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
[[package]]
name = "encoding_rs"
@ -276,16 +349,17 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
[[package]]
name = "firecrawl"
version = "0.1.0"
version = "1.0.0"
dependencies = [
"assert_matches",
"clippy",
"dotenv",
"dotenvy",
"log 0.4.22",
"reqwest",
"rustfmt",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"uuid",
@ -426,13 +500,19 @@ dependencies = [
"futures-core",
"futures-sink",
"http",
"indexmap",
"indexmap 2.2.6",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.14.5"
@ -445,6 +525,12 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "http"
version = "1.1.0"
@ -558,6 +644,35 @@ dependencies = [
"tracing",
]
[[package]]
name = "iana-time-zone"
version = "0.1.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.5.0"
@ -568,6 +683,17 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg 1.3.0",
"hashbrown 0.12.3",
"serde",
]
[[package]]
name = "indexmap"
version = "2.2.6"
@ -575,7 +701,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
"equivalent",
"hashbrown",
"hashbrown 0.14.5",
"serde",
]
[[package]]
@ -701,6 +828,12 @@ dependencies = [
"tempfile",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
@ -846,6 +979,12 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "proc-macro2"
version = "1.0.86"
@ -1293,6 +1432,36 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_with"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857"
dependencies = [
"base64 0.22.1",
"chrono",
"hex",
"indexmap 1.9.3",
"indexmap 2.2.6",
"serde",
"serde_derive",
"serde_json",
"serde_with_macros",
"time",
]
[[package]]
name = "serde_with_macros"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.2"
@ -1342,6 +1511,12 @@ dependencies = [
"log 0.3.9",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@ -1489,6 +1664,37 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "time"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.7.0"
@ -1843,6 +2049,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.48.0"

View File

@ -1,13 +1,13 @@
[package]
name = "firecrawl"
author="Mendable.ai"
version = "0.1.0"
author= "Mendable.ai"
version = "1.0.0"
edition = "2021"
license = "GPL-2.0-or-later"
license = "GPL-3.0-or-later"
homepage = "https://www.firecrawl.dev/"
repository ="https://github.com/mendableai/firecrawl"
description = "Rust SDK for Firecrawl API."
authors = ["sanix-darker <sanixdk@gmail.com>"]
authors = ["Gergő Móricz <mogery@firecrawl.dev>", "sanix-darker <sanixdk@gmail.com>"]
[lib]
path = "src/lib.rs"
@ -18,6 +18,7 @@ name = "firecrawl"
reqwest = { version = "^0.12", features = ["json", "blocking"] }
serde = { version = "^1.0", features = ["derive"] }
serde_json = "^1.0"
serde_with = "^3.9"
log = "^0.4"
thiserror = "^1.0"
uuid = { version = "^1.10", features = ["v4"] }
@ -27,7 +28,7 @@ tokio = { version = "^1", features = ["full"] }
clippy = "^0.0.302"
rustfmt = "^0.10"
assert_matches = "^1.5"
dotenv = "^0.15"
dotenvy = "^0.15"
tokio = { version = "1", features = ["full"] }
[build-dependencies]

297
apps/rust-sdk/src/crawl.rs Normal file
View File

@ -0,0 +1,297 @@
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};
/// Output formats that can be requested for each page scraped during a crawl.
///
/// Each variant (de)serializes as the string given in its `#[serde(rename = "...")]`
/// attribute, and converts into the `ScrapeFormats` variant of the same name via `From`.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum CrawlScrapeFormats {
/// Will result in a copy of the Markdown content of the page.
#[serde(rename = "markdown")]
Markdown,
/// Will result in a copy of the filtered, content-only HTML of the page.
#[serde(rename = "html")]
HTML,
/// Will result in a copy of the raw HTML of the page.
#[serde(rename = "rawHtml")]
RawHTML,
/// Will result in a Vec of URLs found on the page.
#[serde(rename = "links")]
Links,
/// Will result in a URL to a screenshot of the page.
///
/// Cannot be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
#[serde(rename = "screenshot")]
Screenshot,
/// Will result in a URL to a full-page screenshot of the page.
///
/// Cannot be used in conjunction with `CrawlScrapeFormats::Screenshot`.
#[serde(rename = "screenshot@fullPage")]
ScreenshotFullPage,
}
impl From<CrawlScrapeFormats> for ScrapeFormats {
fn from(value: CrawlScrapeFormats) -> Self {
match value {
CrawlScrapeFormats::Markdown => Self::Markdown,
CrawlScrapeFormats::HTML => Self::HTML,
CrawlScrapeFormats::RawHTML => Self::RawHTML,
CrawlScrapeFormats::Links => Self::Links,
CrawlScrapeFormats::Screenshot => Self::Screenshot,
CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
}
}
}
/// Scrape settings applied to every page visited by a crawl.
///
/// Every field is optional; `None` fields are omitted from the serialized request.
/// Converts into `ScrapeOptions` via `From`.
// `skip_serializing_none` must come before `#[derive]` so that the serde derive sees the
// `skip_serializing_if` attributes it injects.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<CrawlScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    /// Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,
}
impl From<CrawlScrapeOptions> for ScrapeOptions {
fn from(value: CrawlScrapeOptions) -> Self {
ScrapeOptions {
formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
only_main_content: value.only_main_content,
include_tags: value.include_tags,
exclude_tags: value.exclude_tags,
headers: value.headers,
wait_for: value.wait_for,
timeout: value.timeout,
..Default::default()
}
}
}
/// Options controlling a crawl job.
///
/// Every field is optional; `None` fields are omitted from the serialized request.
/// `idempotency_key` and `poll_interval` are client-side settings and are never serialized.
// `skip_serializing_none` must come before `#[derive]` so that the serde derive sees the
// `skip_serializing_if` attributes it injects.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options to pass through to the scraper.
    pub scrape_options: Option<CrawlScrapeOptions>,

    /// URL RegEx patterns to (exclusively) include.
    ///
    /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
    // NOTE(review): documented as "patterns" (plural) but typed as a single `String`;
    // confirm whether the API expects a list here.
    pub include_paths: Option<String>,

    /// URL RegEx patterns to exclude.
    ///
    /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
    // NOTE(review): same single-`String` vs. list question as `include_paths`.
    pub exclude_paths: Option<String>,

    /// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
    pub max_depth: Option<u32>,

    /// Tells the crawler to ignore the sitemap when crawling. (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl. (default: `10`)
    pub limit: Option<u32>,

    /// Allows the crawler to navigate links that are backwards in the URL hierarchy. (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allows the crawler to follow links to external URLs. (default: `false`)
    pub allow_external_links: Option<bool>,

    /// URL to send Webhook crawl events to.
    pub webhook: Option<String>,

    /// Idempotency key to send to the crawl endpoint.
    ///
    /// Not part of the request body; it is handed to `prepare_headers` instead.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
    #[serde(skip)]
    pub poll_interval: Option<u64>,
}
/// JSON body sent when starting a crawl: the target URL plus the flattened `CrawlOptions`.
// `skip_serializing_none` must come before `#[derive]` to have any effect.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
    /// URL to start crawling from.
    url: String,

    /// Crawl options, serialized at the same level as `url` rather than nested.
    #[serde(flatten)]
    options: CrawlOptions,
}
/// Response envelope carrying a single `Document`.
// `skip_serializing_none` must come before `#[derive]` to have any effect.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}
/// Lifecycle state of a crawl job, as carried in `CrawlStatus::status`.
///
/// Variant names are (de)serialized in camelCase: `scraping`, `completed`, `failed`, `cancelled`.
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
/// The crawl job is in progress.
Scraping,
/// The crawl job has been completed successfully.
Completed,
/// The crawl job has failed.
Failed,
/// The crawl job has been cancelled.
Cancelled,
}
/// Snapshot of a crawl job, as returned by `FirecrawlApp::check_crawl_status`.
// `skip_serializing_none` must come before `#[derive]` so that `next` is actually omitted
// when it is `None`.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl.
    pub data: Vec<Document>,
}
/// Response returned by `FirecrawlApp::crawl_url_async` once a crawl job has been accepted.
// `skip_serializing_none` must come before `#[derive]` to have any effect.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    /// Success flag from the response envelope; kept private.
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}
impl FirecrawlApp {
    /// Starts a crawl job for `url` and returns as soon as the API has accepted it.
    ///
    /// `options` falls back to `CrawlOptions::default()` when `None`. The optional
    /// `idempotency_key` is not part of the body; it is passed to `prepare_headers`.
    ///
    /// # Errors
    /// Returns `FirecrawlError::HttpRequestFailed` if the request cannot be sent, or the
    /// error produced by `handle_response` for the received response.
    pub async fn crawl_url_async(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
        let body = CrawlRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
        };

        let headers = self.prepare_headers(body.options.idempotency_key.as_ref());

        let response = self
            .client
            .post(format!("{}{}/crawl", self.api_url, API_VERSION))
            .headers(headers)
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
    }

    /// Starts a crawl job for `url` and waits until it finishes, returning its documents.
    ///
    /// The job status is polled every `options.poll_interval` milliseconds
    /// (2000 when unset).
    ///
    /// # Errors
    /// Propagates errors from `crawl_url_async` and `check_crawl_status`, and returns
    /// `FirecrawlError::CrawlJobFailed` if the job fails or is cancelled.
    pub async fn crawl_url(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
    ) -> Result<Vec<Document>, FirecrawlError> {
        // Read the interval before `options` is moved into `crawl_url_async`.
        let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);

        let res = self.crawl_url_async(url, options).await?;

        self.monitor_job_status(&res.id, poll_interval).await
    }

    /// Fetches the current status of the crawl job identified by `id`.
    ///
    /// # Errors
    /// Returns `FirecrawlError::HttpRequestFailed` if the request cannot be sent, or the
    /// error produced by `handle_response` for the received response.
    pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(format!("{}{}/crawl/{}", self.api_url, API_VERSION, id))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response(response, "check crawl status").await
    }

    /// Polls the crawl job `id` until it leaves the `Scraping` state.
    ///
    /// `poll_interval` is the delay between status checks, in milliseconds.
    ///
    /// # Errors
    /// Propagates errors from `check_crawl_status`, and returns
    /// `FirecrawlError::CrawlJobFailed` if the job fails or is cancelled.
    async fn monitor_job_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<Vec<Document>, FirecrawlError> {
        loop {
            let status_data = self.check_crawl_status(id).await?;
            match status_data.status {
                CrawlStatusTypes::Completed => {
                    // NOTE(review): `status_data.next` is ignored, so only the documents in
                    // this response are returned — confirm whether further batches need
                    // to be fetched.
                    return Ok(status_data.data);
                }
                CrawlStatusTypes::Scraping => {
                    // The interval is in milliseconds (see `CrawlOptions::poll_interval`).
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    return Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job failed.".to_string(),
                    ));
                }
                CrawlStatusTypes::Cancelled => {
                    return Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job was cancelled.".to_string(),
                    ));
                }
            }
        }
    }
}

View File

@ -0,0 +1,86 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
/// Metadata attached to a scraped `Document`.
///
/// Field names are (de)serialized in camelCase, except `source_url`, which maps to
/// `sourceURL`. `None` fields are omitted when serializing.
// `skip_serializing_none` must come before `#[derive]` so that the serde derive sees the
// `skip_serializing_if` attributes it injects.
// NOTE(review): `source_url`, `status_code`, `title` and `description` are not optional, so
// deserialization fails if any of them is absent — confirm the API always sends them.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct DocumentMetadata {
    // firecrawl specific
    #[serde(rename = "sourceURL")]
    pub source_url: String,
    pub status_code: u16,
    pub error: Option<String>,

    // basic meta tags
    pub title: String,
    pub description: String,
    pub language: Option<String>,
    pub keywords: Option<String>,
    pub robots: Option<String>,

    // og: namespace
    pub og_title: Option<String>,
    pub og_description: Option<String>,
    pub og_url: Option<String>,
    pub og_image: Option<String>,
    pub og_audio: Option<String>,
    pub og_determiner: Option<String>,
    pub og_locale: Option<String>,
    pub og_locale_alternate: Option<String>,
    pub og_site_name: Option<String>,
    pub og_video: Option<String>,

    // article: namespace
    pub article_section: Option<String>,
    pub article_tag: Option<String>,
    pub published_time: Option<String>,
    pub modified_time: Option<String>,

    // dc./dcterms. namespace
    pub dcterms_keywords: Option<String>,
    pub dc_description: Option<String>,
    pub dc_subject: Option<String>,
    pub dcterms_subject: Option<String>,
    pub dcterms_audience: Option<String>,
    pub dc_type: Option<String>,
    pub dcterms_type: Option<String>,
    pub dc_date: Option<String>,
    pub dc_date_created: Option<String>,
    pub dcterms_created: Option<String>,
}
/// A single scraped page: the requested content formats plus the page metadata.
///
/// Field names are (de)serialized in camelCase; `None` fields are omitted when serializing.
// `skip_serializing_none` must come before `#[derive]` so that the serde derive sees the
// `skip_serializing_if` attributes it injects.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Document {
    /// The Markdown content of the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)
    pub markdown: Option<String>,

    /// The HTML of the page, present if `ScrapeFormats::HTML` is present in `ScrapeOptions.formats`.
    ///
    /// This contains HTML that has non-content tags removed. If you need the original HTML, use `ScrapeFormats::RawHTML`.
    pub html: Option<String>,

    /// The raw HTML of the page, present if `ScrapeFormats::RawHTML` is present in `ScrapeOptions.formats`.
    ///
    /// This contains the original, untouched HTML on the page. If you only need human-readable content, use `ScrapeFormats::HTML`.
    pub raw_html: Option<String>,

    /// The URL to the screenshot of the page, present if `ScrapeFormats::Screenshot` or `ScrapeFormats::ScreenshotFullPage` is present in `ScrapeOptions.formats`.
    pub screenshot: Option<String>,

    /// A list of the links on the page, present if `ScrapeFormats::Links` is present in `ScrapeOptions.formats`.
    pub links: Option<Vec<String>>,

    /// The extracted data from the page, present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
    /// If `ScrapeOptions.extract.schema` is `Some`, this `Value` is guaranteed to match the provided schema.
    pub extract: Option<Value>,

    /// The metadata from the page.
    pub metadata: DocumentMetadata,

    /// Can be present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
    /// The warning message will contain any errors encountered during the extraction.
    pub warning: Option<String>,
}

View File

@ -0,0 +1,29 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;
/// Error payload reported by the API; carried by `FirecrawlError::APIError`.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
/// Always false.
success: bool,
/// Error message
pub error: String,
/// Additional details of this error. Schema depends on the error itself.
pub details: Option<Value>,
}
/// Errors returned by the SDK.
///
/// The `Display` text of each variant comes from its `#[error("...")]` attribute.
#[derive(Error, Debug)]
pub enum FirecrawlError {
/// The HTTP request could not be sent, or the API reported a failure; the payload is a
/// human-readable description.
#[error("HTTP request failed: {0}")]
HttpRequestFailed(String),
/// No API key was supplied when constructing `FirecrawlApp`.
#[error("API key not provided")]
APIKeyNotProvided,
/// The response body could not be parsed into the expected type.
#[error("Failed to parse response: {0}")]
ResponseParseError(String),
/// A structured error payload from the API. Note that the `Display` text does not include
/// the payload's message; read it from the wrapped `FirecrawlAPIError`.
#[error("API error")]
APIError(FirecrawlAPIError),
/// A crawl job ended in the failed or cancelled state.
#[error("Crawl job failed or stopped: {0}")]
CrawlJobFailed(String),
}

View File

@ -1,40 +1,14 @@
/*
*
* - Structs and Enums:
* FirecrawlError: Custom error enum for handling various errors.
* FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
*
* - Initialization:
*
* FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
*
* - API Methods:
* scrape_url, search, crawl_url, check_crawl_status:
* Methods for interacting with the Firecrawl API, similar to the Python methods.
* monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
*/
use std::env;
use std::thread;
use std::time::Duration;
use log::debug;
use reqwest::{Client, Response};
use serde::de::DeserializeOwned;
use serde_json::json;
use serde_json::Value;
use thiserror::Error;
#[derive(Error, Debug)]
pub enum FirecrawlError {
#[error("HTTP request failed: {0}")]
HttpRequestFailed(String),
#[error("API key not provided")]
ApiKeyNotProvided,
#[error("Failed to parse response: {0}")]
ResponseParseError(String),
#[error("Crawl job failed or stopped: {0}")]
CrawlJobFailed(String),
}
pub mod crawl;
pub mod document;
mod error;
pub mod scrape;
pub use error::FirecrawlError;
#[derive(Clone, Debug)]
pub struct FirecrawlApp {
@ -42,26 +16,15 @@ pub struct FirecrawlApp {
api_url: String,
client: Client,
}
// the api verstion of firecrawl
const API_VERSION: &str = "/v0";
pub(crate) const API_VERSION: &str = "/v1";
impl FirecrawlApp {
/// Initialize the FirecrawlApp instance.
///
/// # Arguments:
/// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
/// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
let api_key = api_key
.or_else(|| env::var("FIRECRAWL_API_KEY").ok())
.ok_or(FirecrawlError::ApiKeyNotProvided)?;
let api_url = api_url.unwrap_or_else(|| {
env::var("FIRECRAWL_API_URL")
.unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
});
debug!("Initialized FirecrawlApp with API key: {}", api_key);
debug!("Initialized FirecrawlApp with API URL: {}", api_url);
.ok_or(FirecrawlError::APIKeyNotProvided)?;
let api_url = api_url
.unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
Ok(FirecrawlApp {
api_key,
@ -70,237 +33,7 @@ impl FirecrawlApp {
})
}
/// Scrape the specified URL using the Firecrawl API.
///
/// # Arguments:
/// * `url` (str): The URL to scrape.
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
///
/// # Returns:
/// * `Any`: The scraped data if the request is successful.
///
/// # Raises:
/// * `Exception`: If the scrape request fails.
pub async fn scrape_url(
&self,
url: &str,
params: Option<Value>,
) -> Result<Value, FirecrawlError> {
let headers = self.prepare_headers(None);
let mut scrape_params = json!({"url": url});
if let Some(mut params) = params {
if let Some(extractor_options) = params.get_mut("extractorOptions") {
if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
}
extractor_options["mode"] = extractor_options
.get("mode")
.cloned()
.unwrap_or_else(|| json!("llm-extraction"));
}
scrape_params["extractorOptions"] = extractor_options.clone();
}
for (key, value) in params.as_object().unwrap() {
if key != "extractorOptions" {
scrape_params[key] = value.clone();
}
}
}
let response = self
.client
.post(&format!("{}{}/scrape", self.api_url, API_VERSION))
.headers(headers)
.json(&scrape_params)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
self.handle_response(response, "scrape URL").await
}
/// Perform a search using the Firecrawl API.
///
/// # Arguments:
/// * `query` (str): The search query.
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
///
/// # Returns:
/// * `Any`: The search results if the request is successful.
///
/// # Raises:
/// * `Exception`: If the search request fails.
pub async fn search(
&self,
query: &str,
params: Option<Value>,
) -> Result<Value, FirecrawlError> {
let headers = self.prepare_headers(None);
let mut json_data = json!({"query": query});
if let Some(params) = params {
for (key, value) in params.as_object().unwrap() {
json_data[key] = value.clone();
}
}
let response = self
.client
.post(&format!("{}{}/search", self.api_url, API_VERSION))
.headers(headers)
.json(&json_data)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
self.handle_response(response, "search").await
}
/// Initiate a crawl job for the specified URL using the Firecrawl API.
///
/// # Arguments:
/// * `url` (str): The URL to crawl.
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
/// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
/// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
/// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
///
/// # Returns:
/// * `Any`: The crawl job ID or the crawl results if waiting until completion.
///
/// # `Raises`:
/// * `Exception`: If the crawl job initiation or monitoring fails.
pub async fn crawl_url(
&self,
url: &str,
params: Option<Value>,
wait_until_done: bool,
poll_interval: u64,
idempotency_key: Option<String>,
) -> Result<Value, FirecrawlError> {
let headers = self.prepare_headers(idempotency_key);
let mut json_data = json!({"url": url});
if let Some(params) = params {
for (key, value) in params.as_object().unwrap() {
json_data[key] = value.clone();
}
}
let response = self
.client
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
.headers(headers.clone())
.json(&json_data)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
let response_json = self.handle_response(response, "start crawl job").await?;
let job_id = response_json["jobId"].as_str().unwrap().to_string();
if wait_until_done {
self.monitor_job_status(&job_id, headers, poll_interval)
.await
} else {
Ok(json!({"jobId": job_id}))
}
}
/// Check the status of a crawl job using the Firecrawl API.
///
/// # Arguments:
/// * `job_id` (str): The ID of the crawl job.
///
/// # Returns:
/// * `Any`: The status of the crawl job.
///
/// # Raises:
/// * `Exception`: If the status check request fails.
pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
let headers = self.prepare_headers(None);
let response = self
.client
.get(&format!(
"{}{}/crawl/status/{}",
self.api_url, API_VERSION, job_id
))
.headers(headers)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
self.handle_response(response, "check crawl status").await
}
/// Monitor the status of a crawl job until completion.
///
/// # Arguments:
/// * `job_id` (str): The ID of the crawl job.
/// * `headers` (Dict[str, str]): The headers to include in the status check requests.
/// * `poll_interval` (int): Secounds between status checks.
///
/// # Returns:
/// * `Any`: The crawl results if the job is completed successfully.
///
/// # Raises:
/// Exception: If the job fails or an error occurs during status checks.
async fn monitor_job_status(
&self,
job_id: &str,
headers: reqwest::header::HeaderMap,
poll_interval: u64,
) -> Result<Value, FirecrawlError> {
loop {
let response = self
.client
.get(&format!(
"{}{}/crawl/status/{}",
self.api_url, API_VERSION, job_id
))
.headers(headers.clone())
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
let status_data = self.handle_response(response, "check crawl status").await?;
match status_data["status"].as_str() {
Some("completed") => {
if status_data["data"].is_object() {
return Ok(status_data["data"].clone());
} else {
return Err(FirecrawlError::CrawlJobFailed(
"Crawl job completed but no data was returned".to_string(),
));
}
}
Some("active") | Some("paused") | Some("pending") | Some("queued")
| Some("waiting") => {
thread::sleep(Duration::from_secs(poll_interval));
}
Some(status) => {
return Err(FirecrawlError::CrawlJobFailed(format!(
"Crawl job failed or was stopped. Status: {}",
status
)));
}
None => {
return Err(FirecrawlError::CrawlJobFailed(
"Unexpected response: no status field".to_string(),
));
}
}
}
}
/// Prepare the headers for API requests.
///
/// # Arguments:
/// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
///
/// # Returns:
/// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
let mut headers = reqwest::header::HeaderMap::new();
headers.insert("Content-Type", "application/json".parse().unwrap());
headers.insert(
@ -313,30 +46,22 @@ impl FirecrawlApp {
headers
}
/// Handle errors from API responses.
///
/// # Arguments:
/// * `response` (requests.Response): The response object from the API request.
/// * `action` (str): Description of the action that was being performed.
///
/// # Raises:
/// Exception: An exception with a message containing the status code and error details from the response.
async fn handle_response(
async fn handle_response<'a, T: DeserializeOwned>(
&self,
response: Response,
action: &str,
) -> Result<Value, FirecrawlError> {
action: impl AsRef<str>,
) -> Result<T, FirecrawlError> {
if response.status().is_success() {
let response_json: Value = response
.json()
.await
.map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
if response_json["success"].as_bool().unwrap_or(false) {
Ok(response_json["data"].clone())
Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
} else {
Err(FirecrawlError::HttpRequestFailed(format!(
"Failed to {}: {}",
action, response_json["error"]
action.as_ref(), response_json["error"]
)))
}
} else {
@ -348,23 +73,23 @@ impl FirecrawlApp {
let message = match status_code {
402 => format!(
"Payment Required: Failed to {}. {}",
action, error_message["error"]
action.as_ref(), error_message["error"]
),
408 => format!(
"Request Timeout: Failed to {} as the request timed out. {}",
action, error_message["error"]
action.as_ref(), error_message["error"]
),
409 => format!(
"Conflict: Failed to {} due to a conflict. {}",
action, error_message["error"]
action.as_ref(), error_message["error"]
),
500 => format!(
"Internal Server Error: Failed to {}. {}",
action, error_message["error"]
action.as_ref(), error_message["error"]
),
_ => format!(
"Unexpected error during {}: Status code {}. {}",
action, status_code, error_message["error"]
action.as_ref(), status_code, error_message["error"]
),
};
Err(FirecrawlError::HttpRequestFailed(message))

139
apps/rust-sdk/src/scrape.rs Normal file
View File

@ -0,0 +1,139 @@
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
/// Output formats that can be requested when scraping a page.
///
/// Each variant is (de)serialized as the string given in its `#[serde(rename = ...)]` attribute.
/// `PartialEq`, `Eq` and `Hash` are derived so formats can be compared, searched for in a
/// `Vec<ScrapeFormats>` (e.g. via `contains`), and stored in hash-based collections.
#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum ScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// Will result in the results of an LLM extraction.
    ///
    /// See `ScrapeOptions.extract` for more options.
    #[serde(rename = "extract")]
    Extract,
}
/// Options for LLM extraction, supplied through `ScrapeOptions.extract`.
///
/// Every field is optional; fields left as `None` are omitted from the serialized output.
// `skip_serializing_none` must precede `#[derive]` so that the serde derive macros see the
// `skip_serializing_if` attributes it adds to each `Option` field.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// Serialized as `systemPrompt`.
    ///
    /// NOTE(review): presumably the system prompt handed to the LLM agent — confirm against the API docs.
    pub system_prompt: Option<Value>,

    /// Extraction prompt to send to the LLM agent along with the page content.
    pub prompt: Option<Value>,
}
/// Options controlling how a page is scraped.
///
/// Every field is optional; fields left as `None` are omitted from the serialized output.
/// Field names are serialized in camelCase.
// `skip_serializing_none` must precede `#[derive]` so that the serde derive macros see the
// `skip_serializing_if` attributes it adds to each `Option` field.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    /// Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,

    /// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`.
    pub extract: Option<ExtractOptions>,
}
/// JSON body sent with a scrape request: the target URL plus the scrape options.
// `skip_serializing_none` is kept before `#[derive]`, which is the position serde_with requires
// for the attribute to take effect.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    /// URL of the page to scrape.
    url: String,

    /// Scrape options; flattened so their fields sit next to `url` at the top level of the body.
    #[serde(flatten)]
    options: ScrapeOptions,
}
/// Shape of the JSON payload that `scrape_url` deserializes from the scrape response.
// `skip_serializing_none` is kept before `#[derive]`, which is the position serde_with requires
// for the attribute to take effect.
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}
impl FirecrawlApp {
    /// Scrapes a single URL and returns the resulting [`Document`].
    ///
    /// Sends a POST request to `{api_url}{API_VERSION}/scrape` with the URL and the
    /// (flattened) scrape options as the JSON body.
    ///
    /// # Arguments
    /// * `url` - URL of the page to scrape.
    /// * `options` - Scrape options; `None` is treated as `ScrapeOptions::default()`.
    ///
    /// # Errors
    /// Returns `FirecrawlError::HttpRequestFailed` if the request could not be sent, and
    /// propagates any error returned by `FirecrawlApp::handle_response`.
    pub async fn scrape_url(
        &self,
        url: impl AsRef<str>,
        options: Option<ScrapeOptions>,
    ) -> Result<Document, FirecrawlError> {
        let body = ScrapeRequestBody {
            url: url.as_ref().to_owned(),
            options: options.unwrap_or_default(),
        };

        // No idempotency key is supplied for scrape requests.
        let headers = self.prepare_headers(None);

        let response = self
            .client
            // The formatted `String` is passed by value; borrowing the temporary is unnecessary.
            .post(format!("{}{}/scrape", self.api_url, API_VERSION))
            .headers(headers)
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        let response = self
            .handle_response::<ScrapeResponse>(response, "scrape URL")
            .await?;

        Ok(response.data)
    }
}

View File

@ -1,5 +1,5 @@
use assert_matches::assert_matches;
use dotenv::dotenv;
use dotenvy::dotenv;
use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;