diff --git a/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb new file mode 100644 index 0000000..ee14f14 --- /dev/null +++ b/examples/simple_web_data_extraction_with_claude/simple_web_data_extraction_with_claude.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scraping and Extraction with Firecrawl and Claude\n", + "\n", + "This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import json\n", + "from firecrawl import FirecrawlApp\n", + "from anthropic import Anthropic\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Set Up API Keys and URL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "URL to scrape: https://mendable.ai\n" + ] + } + ], + "source": [ + "# Retrieve API keys from environment variables\n", + "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Set the URL to scrape\n", + "url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n", + "\n", + "print(f\"URL to scrape: {url}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Initialize Firecrawl and Anthropic Clients" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Firecrawl and Anthropic clients initialized.\n" + ] + } + ], + "source": [ + "# Initialize FirecrawlApp and Anthropic client\n", + "firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n", + "anthropic_client = Anthropic(api_key=anthropic_api_key)\n", + "\n", + "print(\"Firecrawl and Anthropic clients initialized.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Scrape the URL using Firecrawl" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page content scraped. Length: 16199 characters\n" + ] + } + ], + "source": [ + "# Scrape the URL using Firecrawl\n", + "page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n", + "\n", + "print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Prepare the Prompt for Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt prepared for Claude.\n" + ] + } + ], + "source": [ + "# Prepare the prompt for Claude\n", + "prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n", + "1. The title of the page\n", + "2. Whether the company is part of Y Combinator (YC)\n", + "3. Whether the company/product is open source\n", + "\n", + "Return the information in JSON format with the following schema:\n", + "{{\n", + " \"main_header_title\": string,\n", + " \"is_yc_company\": boolean,\n", + " \"is_open_source\": boolean\n", + "}}\n", + "\n", + "Webpage content:\n", + "{page_content['content']}\n", + "\n", + "Return only the JSON, nothing else.\"\"\"\n", + "\n", + "print(\"Prompt prepared for Claude.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Query Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Claude response received.\n" + ] + } + ], + "source": [ + "# Query Claude\n", + "response = anthropic_client.messages.create(\n", + " model=\"claude-3-opus-20240229\",\n", + " max_tokens=1000,\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + ")\n", + "\n", + "print(\"Claude response received.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Parse and Display the Result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"title\": \"Just in time answers for Sales and Support\",\n", + " \"is_yc_company\": true,\n", + " \"is_open_source\": false\n", + "}\n" + ] + } + ], + "source": [ + "# Parse and print the result\n", + "result = json.loads(response.content[0].text)\n", + "print(json.dumps(result, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}