Add new example Web Scraping and Extraction with Firecrawl and Claude

2024-08-28 09:35:43 -04:00 · 2024-08-28 09:35:43 -04:00 · 51d1a2e5f2
parent ff08d7093e
commit 51d1a2e5f2
1 changed files with 259 additions and 0 deletions
--- a/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb
+++ b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb
@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Web Scraping and Extraction with Firecrawl and Claude\n",
+    "\n",
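+    "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it.\n",
+    "\n",
+    "Before running it, set `ANTHROPIC_API_KEY` and `FIRECRAWL_API_KEY` in your environment or in a `.env` file; the notebook loads them with `python-dotenv`."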
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Import Required Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from firecrawl import FirecrawlApp\n",
+    "from anthropic import Anthropic\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Set Up API Keys and URL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "URL to scrape: https://mendable.ai\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieve API keys from environment variables\n",
+    "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
+    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
+    "\n",
+    "# Fail fast with a clear message: os.getenv returns None for an unset variable,\n",
+    "# which would otherwise only surface later as an opaque client/authentication error.\n",
+    "missing_keys = [\n",
+    "    name\n",
+    "    for name, value in (\n",
+    "        (\"ANTHROPIC_API_KEY\", anthropic_api_key),\n",
+    "        (\"FIRECRAWL_API_KEY\", firecrawl_api_key),\n",
+    "    )\n",
+    "    if not value\n",
+    "]\n",
+    "if missing_keys:\n",
+    "    raise EnvironmentError(f\"Missing environment variable(s): {', '.join(missing_keys)}\")\n",
+    "\n",
+    "# Set the URL to scrape\n",
+    "url = \"https://mendable.ai\"  # Replace with the actual URL you want to scrape\n",
+    "\n",
+    "print(f\"URL to scrape: {url}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Initialize Firecrawl and Anthropic Clients"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Firecrawl and Anthropic clients initialized.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create one client per service, each authenticated with its own API key\n",
+    "firecrawl_app = FirecrawlApp(\n",
+    "    api_key=firecrawl_api_key,\n",
+    ")\n",
+    "anthropic_client = Anthropic(\n",
+    "    api_key=anthropic_api_key,\n",
+    ")\n",
+    "\n",
+    "print(\"Firecrawl and Anthropic clients initialized.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4: Scrape the URL using Firecrawl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Page content scraped. Length: 16199 characters\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fetch the page through Firecrawl, requesting only its main content\n",
+    "scrape_params = {\"pageOptions\": {\"onlyMainContent\": True}}\n",
+    "page_content = firecrawl_app.scrape_url(url, params=scrape_params)\n",
+    "\n",
+    "content_length = len(page_content['content'])\n",
+    "print(f\"Page content scraped. Length: {content_length} characters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5: Prepare the Prompt for Claude"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Prompt prepared for Claude.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Prepare the prompt for Claude\n",
+    "# Item 1 is worded to match the `main_header_title` schema key, and the scraped page\n",
+    "# (untrusted text) is wrapped in XML tags to keep it separate from the instructions.\n",
+    "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
+    "1. The main header title of the page\n",
+    "2. Whether the company is part of Y Combinator (YC)\n",
+    "3. Whether the company/product is open source\n",
+    "\n",
+    "Return the information in JSON format with the following schema:\n",
+    "{{\n",
+    "    \"main_header_title\": string,\n",
+    "    \"is_yc_company\": boolean,\n",
+    "    \"is_open_source\": boolean\n",
+    "}}\n",
+    "\n",
+    "Webpage content:\n",
+    "<webpage_content>\n",
+    "{page_content['content']}\n",
+    "</webpage_content>\n",
+    "\n",
+    "Return only the JSON, nothing else.\"\"\"\n",
+    "\n",
+    "print(\"Prompt prepared for Claude.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6: Query Claude"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Claude response received.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Send the prepared prompt to Claude as a single user message\n",
+    "user_message = {\"role\": \"user\", \"content\": prompt}\n",
+    "response = anthropic_client.messages.create(\n",
+    "    model=\"claude-3-opus-20240229\",\n",
+    "    max_tokens=1000,\n",
+    "    messages=[user_message],\n",
+    ")\n",
+    "\n",
+    "print(\"Claude response received.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 7: Parse and Display the Result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"title\": \"Just in time answers for Sales and Support\",\n",
+      "  \"is_yc_company\": true,\n",
+      "  \"is_open_source\": false\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Parse and print the result\n",
+    "raw_text = response.content[0].text\n",
+    "\n",
+    "# Claude can wrap the JSON in a code fence or add prose despite the prompt,\n",
+    "# so parse only the span from the first '{' to the last '}'.\n",
+    "start, end = raw_text.find(\"{\"), raw_text.rfind(\"}\")\n",
+    "if start == -1 or end < start:\n",
+    "    raise ValueError(f\"No JSON object found in Claude's response: {raw_text!r}\")\n",
+    "\n",
+    "result = json.loads(raw_text[start:end + 1])\n",
+    "print(json.dumps(result, indent=2))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}