fix(go-sdk): submodules

This commit is contained in:
rafaelsideguide 2024-08-26 11:11:34 -03:00
parent 96e91ab9ec
commit 1baba3ce0a
15 changed files with 10 additions and 1281 deletions

8
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
path = apps/go-sdk/firecrawl
[submodule "apps/go-sdk/firecrawl-go"]
path = apps/go-sdk/firecrawl-go
url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
path = apps/go-sdk/examples
[submodule "apps/go-sdk/firecrawl-go-examples"]
path = apps/go-sdk/firecrawl-go-examples
url = https://github.com/mendableai/firecrawl-go-examples

View File

@ -1,12 +1,16 @@
### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc
Authorization: Bearer fc-YOUR_API_KEY
content-type: application/json
{
"url":"firecrawl.dev"
"url":"corterix.com"
}
### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-YOUR_API_KEY
### Check Job Status
GET http://localhost:3002/v0/jobs/active HTTP/1.1

View File

@ -1,25 +0,0 @@
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, built with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/
# Go workspace file
go.work
go.work.sum
# env file
.env

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 Mendable
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,87 +0,0 @@
package main
import (
"encoding/json"
"fmt"
"log"
"github.com/google/uuid"
"github.com/mendableai/firecrawl-go"
)
// main demonstrates the Firecrawl Go SDK end to end: scraping a single URL,
// crawling a site with crawler options, and LLM-based structured extraction
// against a JSON schema. Every step exits via log.Fatalf on error, which is
// appropriate for an example program only.
func main() {
	// NOTE(review): replace the placeholder key before running; the second
	// argument is the API base URL.
	app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
	if err != nil {
		log.Fatalf("Failed to create FirecrawlApp: %v", err)
	}
	// Scrape a website
	scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil)
	if err != nil {
		log.Fatalf("Failed to scrape URL: %v", err)
	}
	fmt.Println(scrapeResult.Markdown)
	// Crawl a website
	idempotencyKey := uuid.New().String() // optional idempotency key
	crawlParams := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	// true/2: block until the crawl finishes, polling every 2 seconds.
	crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
	if err != nil {
		log.Fatalf("Failed to crawl URL: %v", err)
	}
	jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ")
	if err != nil {
		log.Fatalf("Failed to marshal crawl result: %v", err)
	}
	fmt.Println(string(jsonCrawlResult))
	// LLM Extraction using JSON schema: ask for the top 5 Hacker News stories
	// as structured objects.
	jsonSchema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"top": map[string]any{
				"type": "array",
				"items": map[string]any{
					"type": "object",
					"properties": map[string]any{
						"title": map[string]string{"type": "string"},
						"points": map[string]string{"type": "number"},
						"by": map[string]string{"type": "string"},
						"commentsURL": map[string]string{"type": "string"},
					},
					"required": []string{"title", "points", "by", "commentsURL"},
				},
				"minItems": 5,
				"maxItems": 5,
				"description": "Top 5 stories on Hacker News",
			},
		},
		"required": []string{"top"},
	}
	llmExtractionParams := map[string]any{
		"extractorOptions": firecrawl.ExtractorOptions{
			ExtractionSchema: jsonSchema,
			Mode: "llm-extraction",
		},
		"pageOptions": map[string]any{
			"onlyMainContent": true,
		},
	}
	llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
	if err != nil {
		log.Fatalf("Failed to perform LLM extraction: %v", err)
	}
	// Pretty print the LLM extraction result
	jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ")
	if err != nil {
		log.Fatalf("Failed to marshal LLM extraction result: %v", err)
	}
	fmt.Println(string(jsonResult))
}

View File

@ -1,9 +0,0 @@
module github.com/mendableai/firecrawl-go-examples
go 1.22.5
replace github.com/mendableai/firecrawl => ../
require github.com/google/uuid v1.6.0
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect

View File

@ -1,14 +0,0 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -1,2 +0,0 @@
API_URL=http://localhost:3002
TEST_API_KEY=fc-YOUR-API-KEY

View File

@ -1,2 +0,0 @@
.env
vendor

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 Sideguide Technologies Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,189 +0,0 @@
# Firecrawl Go SDK
The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
## Installation
To install the Firecrawl Go SDK, you can use `go get`:
```bash
go get github.com/mendableai/firecrawl-go
```
## Usage
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
Here's an example of how to use the SDK with error handling:
```go
import (
"fmt"
"log"
"github.com/mendableai/firecrawl-go"
)
func main() {
// Initialize the FirecrawlApp with your API key
app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY", "")
if err != nil {
log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
}
// Scrape a single URL
url := "https://mendable.ai"
scrapedData, err := app.ScrapeURL(url, nil)
if err != nil {
log.Fatalf("Error occurred while scraping: %v", err)
}
fmt.Println(scrapedData)
// Crawl a website
crawlUrl := "https://mendable.ai"
params := map[string]any{
"pageOptions": map[string]any{
"onlyMainContent": true,
},
}
crawlResult, err := app.CrawlURL(crawlUrl, params, true, 2, "")
if err != nil {
log.Fatalf("Error occurred while crawling: %v", err)
}
fmt.Println(crawlResult)
}
```
### Scraping a URL
To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
```go
url := "https://mendable.ai"
scrapedData, err := app.ScrapeURL(url, nil)
if err != nil {
log.Fatalf("Failed to scrape URL: %v", err)
}
fmt.Println(scrapedData)
```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. Here is how to use it:
```go
jsonSchema := map[string]any{
"type": "object",
"properties": map[string]any{
"top": map[string]any{
"type": "array",
"items": map[string]any{
"type": "object",
"properties": map[string]any{
"title": map[string]string{"type": "string"},
"points": map[string]string{"type": "number"},
"by": map[string]string{"type": "string"},
"commentsURL": map[string]string{"type": "string"},
},
"required": []string{"title", "points", "by", "commentsURL"},
},
"minItems": 5,
"maxItems": 5,
"description": "Top 5 stories on Hacker News",
},
},
"required": []string{"top"},
}
llmExtractionParams := map[string]any{
"extractorOptions": firecrawl.ExtractorOptions{
ExtractionSchema: jsonSchema,
},
}
scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
if err != nil {
log.Fatalf("Failed to perform LLM extraction: %v", err)
}
fmt.Println(scrapeResult)
```
### Search for a query
To search the web, get the most relevant results, scrape each page and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results.
```go
query := "what is mendable?"
searchResult, err := app.Search(query, nil)
if err != nil {
log.Fatalf("Failed to search: %v", err)
}
fmt.Println(searchResult)
```
### Crawling a Website
To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
```go
crawlParams := map[string]any{
"crawlerOptions": map[string]any{
"excludes": []string{"blog/*"},
"includes": []string{}, // leave empty for all pages
"limit": 1000,
},
"pageOptions": map[string]any{
"onlyMainContent": true,
},
}
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
if err != nil {
log.Fatalf("Failed to crawl URL: %v", err)
}
fmt.Println(crawlResult)
```
### Checking Crawl Status
To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```go
status, err := app.CheckCrawlStatus(jobId)
if err != nil {
log.Fatalf("Failed to check crawl status: %v", err)
}
fmt.Println(status)
```
### Canceling a Crawl Job
To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.
```go
canceled, err := app.CancelCrawlJob(jobId)
if err != nil {
log.Fatalf("Failed to cancel crawl job: %v", err)
}
fmt.Println(canceled)
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
## Contributing
Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
## License
The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.

View File

@ -1,584 +0,0 @@
// Package firecrawl provides a client for interacting with the Firecrawl API.
package firecrawl
import (
"bytes"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"time"
)
// FirecrawlDocumentMetadata represents metadata for a Firecrawl document.
// It mirrors the page-level metadata the API returns for a fetched page:
// standard HTML metadata, OpenGraph ("og*") fields, Dublin Core
// ("dc*"/"dcterms*") fields, and the HTTP status/error recorded for the
// page fetch. All fields are optional in the JSON payload.
type FirecrawlDocumentMetadata struct {
	Title string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language string `json:"language,omitempty"`
	Keywords string `json:"keywords,omitempty"`
	Robots string `json:"robots,omitempty"`
	OGTitle string `json:"ogTitle,omitempty"`
	OGDescription string `json:"ogDescription,omitempty"`
	OGURL string `json:"ogUrl,omitempty"`
	OGImage string `json:"ogImage,omitempty"`
	OGAudio string `json:"ogAudio,omitempty"`
	OGDeterminer string `json:"ogDeterminer,omitempty"`
	OGLocale string `json:"ogLocale,omitempty"`
	OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
	OGSiteName string `json:"ogSiteName,omitempty"`
	OGVideo string `json:"ogVideo,omitempty"`
	DCTermsCreated string `json:"dctermsCreated,omitempty"`
	DCDateCreated string `json:"dcDateCreated,omitempty"`
	DCDate string `json:"dcDate,omitempty"`
	DCTermsType string `json:"dctermsType,omitempty"`
	DCType string `json:"dcType,omitempty"`
	DCTermsAudience string `json:"dctermsAudience,omitempty"`
	DCTermsSubject string `json:"dctermsSubject,omitempty"`
	DCSubject string `json:"dcSubject,omitempty"`
	DCDescription string `json:"dcDescription,omitempty"`
	DCTermsKeywords string `json:"dctermsKeywords,omitempty"`
	ModifiedTime string `json:"modifiedTime,omitempty"`
	PublishedTime string `json:"publishedTime,omitempty"`
	ArticleTag string `json:"articleTag,omitempty"`
	ArticleSection string `json:"articleSection,omitempty"`
	// SourceURL is the URL the content was fetched from; PageStatusCode and
	// PageError record the HTTP outcome of that fetch.
	SourceURL string `json:"sourceURL,omitempty"`
	PageStatusCode int `json:"pageStatusCode,omitempty"`
	PageError string `json:"pageError,omitempty"`
}
// FirecrawlDocument represents a document in Firecrawl — a single scraped or
// crawled page returned by the API. Content is the only field always encoded;
// the others are omitted when empty and appear to depend on request options
// (e.g. HTML is populated when the includeHtml page option is set, and
// LLMExtraction when an llm-extraction extractor is used — see ScrapeURL).
type FirecrawlDocument struct {
	ID string `json:"id,omitempty"`
	URL string `json:"url,omitempty"`
	Content string `json:"content"`
	Markdown string `json:"markdown,omitempty"`
	HTML string `json:"html,omitempty"`
	LLMExtraction map[string]any `json:"llm_extraction,omitempty"`
	CreatedAt *time.Time `json:"createdAt,omitempty"`
	UpdatedAt *time.Time `json:"updatedAt,omitempty"`
	Type string `json:"type,omitempty"`
	Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
	ChildrenLinks []string `json:"childrenLinks,omitempty"`
	Provider string `json:"provider,omitempty"`
	Warning string `json:"warning,omitempty"`
	Index int `json:"index,omitempty"`
}
// ExtractorOptions represents options for extraction.
// Mode selects the extraction mode (ScrapeURL defaults it to
// "llm-extraction" when left empty); ExtractionSchema carries the JSON
// schema describing the desired structured output. A schema value may
// implement `schema() any` to expand itself before the request is sent.
type ExtractorOptions struct {
	Mode string `json:"mode,omitempty"`
	ExtractionPrompt string `json:"extractionPrompt,omitempty"`
	ExtractionSchema any `json:"extractionSchema,omitempty"`
}
// ScrapeResponse represents the response for scraping operations.
// Data holds the scraped document when Success is true.
type ScrapeResponse struct {
	Success bool `json:"success"`
	Data *FirecrawlDocument `json:"data,omitempty"`
}
// SearchResponse represents the response for searching operations.
// Data holds the matching documents when Success is true.
type SearchResponse struct {
	Success bool `json:"success"`
	Data []*FirecrawlDocument `json:"data,omitempty"`
}
// CrawlResponse represents the response for crawling operations.
// JobID identifies the asynchronous crawl job; CrawlURL uses it to poll for
// completion or returns it directly to the caller.
type CrawlResponse struct {
	Success bool `json:"success"`
	JobID string `json:"jobId,omitempty"`
	Data []*FirecrawlDocument `json:"data,omitempty"`
}
// JobStatusResponse represents the response for checking crawl job status.
// Status carries the job state ("completed", "active", "paused", "pending",
// "queued", "waiting" are the values handled by monitorJobStatus). Data is
// populated once the job completes; PartialData presumably carries interim
// results while the job runs — TODO confirm against the API.
type JobStatusResponse struct {
	Success bool `json:"success"`
	Status string `json:"status"`
	Current int `json:"current,omitempty"`
	CurrentURL string `json:"current_url,omitempty"`
	CurrentStep string `json:"current_step,omitempty"`
	Total int `json:"total,omitempty"`
	JobID string `json:"jobId,omitempty"`
	Data []*FirecrawlDocument `json:"data,omitempty"`
	PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
}
// CancelCrawlJobResponse represents the response for canceling a crawl job.
// Status is the job's state after the cancellation request.
type CancelCrawlJobResponse struct {
	Success bool `json:"success"`
	Status string `json:"status"`
}
// requestOptions holds retry/backoff configuration for a single API request.
type requestOptions struct {
	retries int // number of attempts; defaults to 1 (no retry)
	backoff int // base backoff between retries, in milliseconds
}

// requestOption mutates a requestOptions value; it is the functional-option
// type accepted by makeRequest.
type requestOption func(*requestOptions)

// newRequestOptions builds a requestOptions, starting from the default of a
// single attempt and applying each provided option in order.
func newRequestOptions(opts ...requestOption) *requestOptions {
	o := requestOptions{retries: 1}
	for _, apply := range opts {
		apply(&o)
	}
	return &o
}

// withRetries returns an option that sets how many attempts a request makes.
func withRetries(n int) requestOption {
	return func(o *requestOptions) { o.retries = n }
}

// withBackoff returns an option that sets the base backoff interval, in
// milliseconds, used between retried attempts.
func withBackoff(ms int) requestOption {
	return func(o *requestOptions) { o.backoff = ms }
}
// FirecrawlApp represents a client for the Firecrawl API.
// APIKey is sent as a Bearer token on every request, APIURL is the base URL
// (e.g. "https://api.firecrawl.dev"), and Client is the underlying HTTP
// client (configured with a 60s timeout by NewFirecrawlApp).
type FirecrawlApp struct {
	APIKey string
	APIURL string
	Client *http.Client
}
// NewFirecrawlApp creates a new FirecrawlApp client.
//
// When apiKey is empty it falls back to the FIRECRAWL_API_KEY environment
// variable and returns an error if that is unset too. When apiURL is empty
// it falls back to FIRECRAWL_API_URL, defaulting to
// "https://api.firecrawl.dev". The returned client uses a 60-second HTTP
// timeout.
//
// Returns the configured client, or an error when no API key can be found.
func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
	key := apiKey
	if key == "" {
		if key = os.Getenv("FIRECRAWL_API_KEY"); key == "" {
			return nil, fmt.Errorf("no API key provided")
		}
	}

	base := apiURL
	if base == "" {
		if base = os.Getenv("FIRECRAWL_API_URL"); base == "" {
			base = "https://api.firecrawl.dev"
		}
	}

	return &FirecrawlApp{
		APIKey: key,
		APIURL: base,
		Client: &http.Client{Timeout: 60 * time.Second},
	}, nil
}
// ScrapeURL scrapes the content of the specified URL via the Firecrawl v0
// scrape endpoint.
//
// Parameters:
//   - url: the URL to be scraped.
//   - params: optional request fields. An "extractorOptions" entry of type
//     ExtractorOptions is normalized before sending: a schema provider
//     implementing `schema() any` is expanded, and Mode defaults to
//     "llm-extraction" when empty. All other entries are copied verbatim
//     into the request body.
//
// Returns the scraped document, or an error if the request fails, the
// response cannot be decoded, or the API reports failure.
func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
	headers := app.prepareHeaders("")

	body := map[string]any{"url": url}
	if params != nil {
		if eo, ok := params["extractorOptions"].(ExtractorOptions); ok {
			// Let schema providers expand themselves into a plain value.
			if provider, ok := eo.ExtractionSchema.(interface{ schema() any }); ok {
				eo.ExtractionSchema = provider.schema()
			}
			if eo.Mode == "" {
				eo.Mode = "llm-extraction"
			}
			body["extractorOptions"] = eo
		}
		for k, v := range params {
			if k == "extractorOptions" {
				continue
			}
			body[k] = v
		}
	}

	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v0/scrape", app.APIURL),
		body,
		headers,
		"scrape URL",
	)
	if err != nil {
		return nil, err
	}

	var parsed ScrapeResponse
	if err := json.Unmarshal(resp, &parsed); err != nil {
		return nil, err
	}
	if !parsed.Success {
		return nil, fmt.Errorf("failed to scrape URL")
	}
	return parsed.Data, nil
}
// Search performs a search query via the Firecrawl v0 search endpoint and
// returns the resulting documents.
//
// Parameters:
//   - query: the search query string.
//   - params: optional extra request fields, merged into the request body.
//
// Returns the matching documents, or an error if the request fails, the
// response cannot be decoded, or the API reports failure.
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
	headers := app.prepareHeaders("")

	body := map[string]any{"query": query}
	for key, value := range params {
		body[key] = value
	}

	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v0/search", app.APIURL),
		body,
		headers,
		"search",
	)
	if err != nil {
		return nil, err
	}

	var parsed SearchResponse
	if err := json.Unmarshal(resp, &parsed); err != nil {
		return nil, err
	}
	if !parsed.Success {
		return nil, fmt.Errorf("failed to search")
	}
	return parsed.Data, nil
}
// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
//
// Parameters:
//   - url: the URL to crawl.
//   - params: optional parameters merged into the crawl request body.
//   - waitUntilDone: if true, block until the crawl job completes and return
//     its documents.
//   - pollInterval: the interval (in seconds) at which to poll the job
//     status when waitUntilDone is true.
//   - idempotencyKey: optional idempotency key sent as a request header.
//
// Returns:
//   - any: the job ID (string) when waitUntilDone is false, or the crawl
//     result ([]*FirecrawlDocument) when waitUntilDone is true.
//   - error: if the request fails or no job ID is returned.
func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
	headers := app.prepareHeaders(idempotencyKey)
	crawlBody := map[string]any{"url": url}
	for k, v := range params {
		crawlBody[k] = v
	}
	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v0/crawl", app.APIURL),
		crawlBody,
		headers,
		"start crawl job",
		withRetries(3),
		withBackoff(500),
	)
	if err != nil {
		return nil, err
	}
	var crawlResponse CrawlResponse
	if err := json.Unmarshal(resp, &crawlResponse); err != nil {
		return nil, err
	}
	// Validate the job ID before using it. Previously it was only checked on
	// the non-blocking path, so waitUntilDone would poll the status endpoint
	// with an empty job ID and never terminate.
	if crawlResponse.JobID == "" {
		return nil, fmt.Errorf("failed to get job ID")
	}
	if waitUntilDone {
		return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
	}
	return crawlResponse.JobID, nil
}
// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
//
// Parameters:
//   - jobID: the ID of the crawl job to check.
//
// Returns the decoded job status, or an error if the request or decoding
// fails. The request is retried up to 3 times with 500ms base backoff.
func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
	resp, err := app.makeRequest(
		http.MethodGet,
		fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
		nil,
		app.prepareHeaders(""),
		"check crawl status",
		withRetries(3),
		withBackoff(500),
	)
	if err != nil {
		return nil, err
	}

	status := new(JobStatusResponse)
	if err := json.Unmarshal(resp, status); err != nil {
		return nil, err
	}
	return status, nil
}
// CancelCrawlJob cancels a crawl job using the Firecrawl API.
//
// Parameters:
//   - jobID: the ID of the crawl job to cancel.
//
// Returns the job's status after cancellation, or an error if the request
// or decoding fails.
func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
	resp, err := app.makeRequest(
		http.MethodDelete,
		fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID),
		nil,
		app.prepareHeaders(""),
		"cancel crawl job",
	)
	if err != nil {
		return "", err
	}

	var parsed CancelCrawlJobResponse
	if err := json.Unmarshal(resp, &parsed); err != nil {
		return "", err
	}
	return parsed.Status, nil
}
// prepareHeaders builds the common headers for an API request: JSON content
// type and the Bearer authorization token.
//
// Parameters:
//   - idempotencyKey: when non-empty, sent as the "x-idempotency-key"
//     header; an empty string adds no such header.
//
// Returns a map containing the headers for the HTTP request.
func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
	h := map[string]string{
		"Content-Type":  "application/json",
		"Authorization": "Bearer " + app.APIKey,
	}
	if idempotencyKey != "" {
		h["x-idempotency-key"] = idempotencyKey
	}
	return h
}
// makeRequest sends an HTTP request to the Firecrawl API and returns the raw
// response body.
//
// Parameters:
//   - method: the HTTP method to use (e.g. "GET", "POST", "DELETE").
//   - url: the fully-qualified URL to send the request to.
//   - data: optional JSON-encodable request body; nil sends an empty body.
//   - headers: headers to set on the request.
//   - action: a human-readable description used in error messages.
//   - opts: optional retry/backoff options (see withRetries, withBackoff).
//
// A 502 response is retried with exponential backoff up to the configured
// number of attempts; any other status ends the retry loop. A non-200 final
// status is converted to an error via handleError.
//
// Returns the response body, or an error if the request ultimately fails.
func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) {
	var payload []byte
	if data != nil {
		var err error
		payload, err = json.Marshal(data)
		if err != nil {
			return nil, err
		}
	}

	options := newRequestOptions(opts...)
	var resp *http.Response
	for i := 0; i < options.retries; i++ {
		// Build a fresh request on every attempt: the body reader is consumed
		// by Client.Do, so reusing a single request would retry with an empty
		// body. (This was a bug in the previous version.)
		req, err := http.NewRequest(method, url, bytes.NewReader(payload))
		if err != nil {
			return nil, err
		}
		for key, value := range headers {
			req.Header.Set(key, value)
		}

		resp, err = app.Client.Do(req)
		if err != nil {
			return nil, err
		}
		if resp.StatusCode != 502 || i == options.retries-1 {
			break
		}
		// Drain and close the 502 response before retrying so the connection
		// can be reused and nothing leaks (previously a defer inside the loop
		// postponed every Close until the function returned).
		io.Copy(io.Discard, resp.Body)
		resp.Body.Close()
		time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond)
	}
	defer resp.Body.Close()

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != 200 {
		return nil, app.handleError(resp.StatusCode, respBody, action)
	}
	return respBody, nil
}
// monitorJobStatus polls a crawl job's status until it completes or fails.
//
// Parameters:
//   - jobID: the ID of the crawl job to monitor.
//   - headers: the headers to be included in each status request.
//   - pollInterval: the interval (in seconds) at which to poll; values below
//     2 are raised to 2 to avoid hammering the API.
//
// Returns:
//   - []*FirecrawlDocument: the crawl result once the job reports "completed".
//   - error: if a status request fails, the status field is missing, the job
//     reports an unrecognized/failed status, or the job completes repeatedly
//     without returning any data.
func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
	attempts := 0
	for {
		resp, err := app.makeRequest(
			http.MethodGet,
			fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
			nil,
			headers,
			"check crawl status",
			withRetries(3),
			withBackoff(500),
		)
		if err != nil {
			return nil, err
		}
		var statusData JobStatusResponse
		err = json.Unmarshal(resp, &statusData)
		if err != nil {
			return nil, err
		}
		status := statusData.Status
		if status == "" {
			return nil, fmt.Errorf("invalid status in response")
		}
		if status == "completed" {
			if statusData.Data != nil {
				return statusData.Data, nil
			}
			// "completed" with no data: the API may not have materialized the
			// result yet, so re-poll a few times before giving up.
			attempts++
			if attempts > 3 {
				return nil, fmt.Errorf("crawl job completed but no data was returned")
			}
		} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" {
			// Job still in progress: wait before the next poll (minimum 2s).
			pollInterval = max(pollInterval, 2)
			time.Sleep(time.Duration(pollInterval) * time.Second)
		} else {
			return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
		}
	}
}
// handleError converts a non-200 Firecrawl API response into a descriptive
// error.
//
// Parameters:
//   - statusCode: the HTTP status code of the failed response.
//   - body: the raw response body, expected to be JSON with an "error" field.
//   - action: a string describing the action being performed.
//
// Returns an error describing the failure reason, including the server's
// error message when one was provided.
func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
	var errorData map[string]any
	err := json.Unmarshal(body, &errorData)
	if err != nil {
		return fmt.Errorf("failed to parse error response: %v", err)
	}
	errorMessage, _ := errorData["error"].(string)
	if errorMessage == "" {
		errorMessage = "No additional error details provided."
	}
	var message string
	switch statusCode {
	case 402:
		message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
	case 408:
		message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
	case 409:
		message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
	case 500:
		message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
	default:
		message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage)
	}
	// Use a fixed format string: fmt.Errorf(message) treats any '%' in the
	// server-provided error text as a formatting verb (go vet printf error).
	return fmt.Errorf("%s", message)
}

View File

@ -1,292 +0,0 @@
package firecrawl
import (
"log"
"os"
"testing"
"time"
"github.com/google/uuid"
"github.com/joho/godotenv"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// API_URL and TEST_API_KEY are loaded once from ../.env and used by every
// test in this file.
var API_URL string
var TEST_API_KEY string

// init loads the test configuration from ../.env before any test runs.
// NOTE(review): log.Fatalf aborts the entire test binary when the file is
// missing — consider falling back to plain environment variables.
func init() {
	err := godotenv.Load("../.env")
	if err != nil {
		log.Fatalf("Error loading .env file: %v", err)
	}
	API_URL = os.Getenv("API_URL")
	TEST_API_KEY = os.Getenv("TEST_API_KEY")
}
// TestNoAPIKey verifies that constructing the client with an empty key
// fails with the "no API key provided" error. Note this assumes the
// FIRECRAWL_API_KEY environment variable is unset — TODO confirm in CI.
func TestNoAPIKey(t *testing.T) {
	_, err := NewFirecrawlApp("", API_URL)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "no API key provided")
}
// TestScrapeURLInvalidAPIKey is a live E2E test: it verifies that an invalid
// API key surfaces the API's 401 Unauthorized message through ScrapeURL.
func TestScrapeURLInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)
	_, err = app.ScrapeURL("https://firecrawl.dev", nil)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token")
}
// TestBlocklistedURL is a live E2E test: it verifies that scraping a
// blocklisted social-media URL is rejected by the API with a 403.
func TestBlocklistedURL(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
}
// TestSuccessfulResponseWithValidPreviewToken scrapes a page using a preview
// token and checks that content is returned.
func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
	client, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL)
	require.NoError(t, err)

	doc, err := client.ScrapeURL("https://roastmywebsite.ai", nil)
	require.NoError(t, err)
	assert.NotNil(t, doc)
	assert.Contains(t, doc.Content, "_Roast_")
}
// TestScrapeURLE2E scrapes a live page with default options and checks that
// content and markdown are populated while HTML stays empty by default.
func TestScrapeURLE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	doc, err := client.ScrapeURL("https://roastmywebsite.ai", nil)
	require.NoError(t, err)
	assert.NotNil(t, doc)
	assert.Contains(t, doc.Content, "_Roast_")
	assert.NotEqual(t, doc.Markdown, "")
	assert.NotNil(t, doc.Metadata)
	assert.Equal(t, doc.HTML, "")
}
// TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML scrapes with
// includeHtml enabled and checks that raw HTML is returned alongside
// content and markdown.
func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	// Ask the API to include the raw page HTML in the response.
	opts := map[string]any{
		"pageOptions": map[string]any{"includeHtml": true},
	}
	doc, err := client.ScrapeURL("https://roastmywebsite.ai", opts)
	require.NoError(t, err)
	assert.NotNil(t, doc)
	assert.Contains(t, doc.Content, "_Roast_")
	assert.Contains(t, doc.Markdown, "_Roast_")
	assert.Contains(t, doc.HTML, "<h1")
	assert.NotNil(t, doc.Metadata)
}
// TestSuccessfulResponseForValidScrapeWithPDFFile scrapes a PDF URL and
// verifies its text content is extracted.
func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	doc, err := client.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil)
	require.NoError(t, err)
	assert.NotNil(t, doc)
	assert.Contains(t, doc.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
	assert.NotNil(t, doc.Metadata)
}
// TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension
// scrapes a PDF URL that lacks a .pdf suffix and verifies the text is still
// extracted.
func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	// ScrapeURL blocks until the API responds, so the fixed 6-second sleep
	// this test previously performed AFTER the call only added latency
	// without affecting the result; it has been removed.
	response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil)
	require.NoError(t, err)
	assert.NotNil(t, response)
	assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
	assert.NotNil(t, response.Metadata)
}
// TestCrawlURLInvalidAPIKey verifies that starting a crawl with a bogus API
// key surfaces the server's 401 Unauthorized error.
func TestCrawlURLInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)
	_, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	// require (not assert) stops the test on a nil error; err.Error()
	// below would otherwise panic and mask the real failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token")
}
// TestShouldReturnErrorForBlocklistedURL verifies that crawling a
// blocklisted social-media URL is refused with the policy error.
func TestShouldReturnErrorForBlocklistedURL(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	_, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "")
	// require (not assert) stops the test on a nil error; err.Error()
	// below would otherwise panic and mask the real failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
}
// TestCrawlURLWaitForCompletionE2E starts a crawl in wait-for-completion
// mode and checks that documents come back with the expected content.
func TestCrawlURLWaitForCompletionE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	// Exclude blog pages to keep the crawl small.
	opts := map[string]any{
		"crawlerOptions": map[string]any{"excludes": []string{"blog/*"}},
	}
	result, err := client.CrawlURL("https://roastmywebsite.ai", opts, true, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, result)

	// In wait mode the result is the list of crawled documents.
	docs, ok := result.([]*FirecrawlDocument)
	assert.True(t, ok)
	assert.Greater(t, len(docs), 0)
	assert.Contains(t, docs[0].Content, "_Roast_")
}
// TestCrawlURLWithIdempotencyKeyE2E crawls once with a fresh idempotency
// key, then verifies that reusing the same key is rejected with a conflict.
func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	uniqueIdempotencyKey := uuid.New().String()
	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, uniqueIdempotencyKey)
	require.NoError(t, err)
	assert.NotNil(t, response)
	data, ok := response.([]*FirecrawlDocument)
	assert.True(t, ok)
	assert.Greater(t, len(data), 0)
	assert.Contains(t, data[0].Content, "_Roast_")
	// Reusing the key must fail. require (not assert) stops the test on a
	// nil error; err.Error() below would otherwise panic and mask the
	// real failure.
	_, err = app.CrawlURL("https://firecrawl.dev", params, true, 2, uniqueIdempotencyKey)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used")
}
// TestCheckCrawlStatusE2E starts an async crawl and polls CheckCrawlStatus
// until the job completes (or a deadline passes), rather than the previous
// single fixed 30-second sleep that was both slow when the crawl finished
// early and flaky when it ran long.
func TestCheckCrawlStatusE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	response, err := app.CrawlURL("https://firecrawl.dev", params, false, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, response)
	jobID, ok := response.(string)
	assert.True(t, ok)
	assert.NotEqual(t, "", jobID)
	// Poll every 5 seconds for up to 2 minutes instead of sleeping a fixed
	// 30 seconds and hoping the crawl is done by then.
	deadline := time.Now().Add(2 * time.Minute)
	var status string
	var numDocs int
	for {
		statusResponse, err := app.CheckCrawlStatus(jobID)
		require.NoError(t, err)
		require.NotNil(t, statusResponse)
		status = statusResponse.Status
		numDocs = len(statusResponse.Data)
		if status == "completed" || time.Now().After(deadline) {
			break
		}
		time.Sleep(5 * time.Second)
	}
	assert.Equal(t, "completed", status)
	assert.Greater(t, numDocs, 0)
}
// TestSearchE2E runs a search query and checks that several non-empty
// results are returned.
func TestSearchE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	results, err := client.Search("test query", nil)
	require.NoError(t, err)
	assert.NotNil(t, results)
	assert.Greater(t, len(results), 2)
	assert.NotEqual(t, results[0].Content, "")
}
// TestSearchInvalidAPIKey verifies that searching with a bogus API key
// surfaces the server's 401 Unauthorized error.
func TestSearchInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)
	_, err = app.Search("test query", nil)
	// require (not assert) stops the test on a nil error; err.Error()
	// below would otherwise panic and mask the real failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during search: Status code 401. Unauthorized: Invalid token")
}
// TestLLMExtraction scrapes mendable.ai in llm-extraction mode and verifies
// that the structured fields declared in the schema come back with the
// expected types.
func TestLLMExtraction(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	// JSON schema describing the fields the LLM should extract.
	schema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"company_mission": map[string]string{"type": "string"},
			"supports_sso":    map[string]string{"type": "boolean"},
			"is_open_source":  map[string]string{"type": "boolean"},
		},
		"required": []string{"company_mission", "supports_sso", "is_open_source"},
	}
	params := map[string]any{
		"extractorOptions": ExtractorOptions{
			Mode:             "llm-extraction",
			ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
			ExtractionSchema: schema,
		},
	}

	doc, err := client.ScrapeURL("https://mendable.ai", params)
	require.NoError(t, err)
	assert.NotNil(t, doc)
	assert.Contains(t, doc.LLMExtraction, "company_mission")
	assert.IsType(t, true, doc.LLMExtraction["supports_sso"])
	assert.IsType(t, true, doc.LLMExtraction["is_open_source"])
}
// TestCancelCrawlJobInvalidAPIKey verifies that cancelling with a bogus API
// key surfaces the server's 401 Unauthorized error.
func TestCancelCrawlJobInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)
	_, err = app.CancelCrawlJob("test query")
	// require (not assert) stops the test on a nil error; err.Error()
	// below would otherwise panic and mask the real failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during cancel crawl job: Status code 401. Unauthorized: Invalid token")
}
// TestCancelNonExistingCrawlJob verifies that cancelling a job ID that was
// never created reports "Job not found".
func TestCancelNonExistingCrawlJob(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)
	jobID := uuid.New().String()
	_, err = app.CancelCrawlJob(jobID)
	// require (not assert) stops the test on a nil error; err.Error()
	// below would otherwise panic and mask the real failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Job not found")
}
// TestCancelCrawlJobE2E starts an async crawl, cancels it by job ID, and
// checks the reported cancellation status.
func TestCancelCrawlJobE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	// Async mode: the result is the job ID, not the crawled documents.
	started, err := client.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, started)

	jobID, ok := started.(string)
	assert.True(t, ok)
	assert.NotEqual(t, "", jobID)

	state, err := client.CancelCrawlJob(jobID)
	require.NoError(t, err)
	assert.Equal(t, "cancelled", state)
}

View File

@ -1,15 +0,0 @@
module github.com/mendableai/firecrawl-go
go 1.22.5
require (
github.com/google/uuid v1.6.0
github.com/joho/godotenv v1.5.1
github.com/stretchr/testify v1.9.0
)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

View File

@ -1,14 +0,0 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=