From 289af6f89e9f18a524786ca05997d4215b4d26a8 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Wed, 25 Sep 2024 21:10:09 -0400 Subject: [PATCH 1/4] example --- .../o1_job_recommender.ipynb | 672 ++++++++++++++++++ examples/o1_web_crawler_actions/main.ipynb | 60 ++ .../o1_web_crawler_actions.py | 271 +++++++ .../o1_web_crawler_actions/requirements.txt | 3 + 4 files changed, 1006 insertions(+) create mode 100644 examples/o1_job_recommender/o1_job_recommender.ipynb create mode 100644 examples/o1_web_crawler_actions/main.ipynb create mode 100644 examples/o1_web_crawler_actions/o1_web_crawler_actions.py create mode 100644 examples/o1_web_crawler_actions/requirements.txt diff --git a/examples/o1_job_recommender/o1_job_recommender.ipynb b/examples/o1_job_recommender/o1_job_recommender.ipynb new file mode 100644 index 0000000..8827817 --- /dev/null +++ b/examples/o1_job_recommender/o1_job_recommender.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# %%\n", + "import os\n", + "import datetime\n", + "import time\n", + "import requests\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "from pydantic import BaseModel, Field\n", + "from typing import List\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n", + "\n", + "# Set the jobs page URL\n", + "jobs_page_url = \"https://openai.com/careers\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total pages mapped (excluding original URL): 14\n", + "['https://openai.com/careers/research-scientist', 'https://openai.com/careers/analytics-engineer', 'https://openai.com/careers/solutions-architect', 'https://openai.com/careers/iam-engineer', 'https://openai.com/careers/talent-partnerships', 'https://openai.com/careers/product-designer', 'https://openai.com/careers/recruiting-coordinator', 'https://openai.com/careers/av-specialist', 'https://openai.com/careers/it-support', 'https://openai.com/careers/director-edu', 'https://openai.com/careers/research-engineer', 'https://openai.com/careers/solutions-engineer', 'https://openai.com/careers/software-engineer-networking', 'https://openai.com/careers/revenue-operations-leader']\n" + ] + } + ], + "source": [ + "# %%\n", + "# Use the Firecrawl Map API to get the sitemap\n", + "api_url = \"https://api.firecrawl.dev/v1/map\"\n", + "payload = {\n", + " \"url\": jobs_page_url,\n", + " \"search\": \"\", # Empty search term to get all pages\n", + " \"limit\": 15\n", + "}\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {firecrawl_api_key}\",\n", + " \"Content-Type\": \"application/json\"\n", + "}\n", + "response = requests.post(api_url, json=payload, headers=headers)\n", + "\n", + "if response.status_code == 200:\n", + " map_result = response.json()\n", + " if map_result.get('success'):\n", + " links = [link for link in map_result.get('links', []) if link != jobs_page_url]\n", + " print(f\"Total pages mapped (excluding original URL): {len(links)}\")\n", + " print(links)\n", + " else:\n", + " print(\"Map API request was not successful\")\n", + " exit(1)\n", + "else:\n", + " print(f\"Error: 
{response.status_code}\")\n", + " print(response.text)\n", + " exit(1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error 500 for page 0: {\"success\":false,\"error\":\"(Internal server error) - JSON parsing error(s): must be object\\n\\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. - Could be due to LLM parsing issues\"}\n", + "Data extracted for page 1\n", + "Data extracted for page 2\n", + "Data extracted for page 3\n", + "Data extracted for page 4\n", + "Data extracted for page 5\n", + "Data extracted for page 6\n", + "Data extracted for page 7\n", + "Data extracted for page 8\n", + "Data extracted for page 9\n", + "Data extracted for page 10\n", + "Data extracted for page 11\n", + "Data extracted for page 12\n", + "Data extracted for page 13\n" + ] + } + ], + "source": [ + "# %%\n", + "# Define the extraction schema\n", + "extract_schema = {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"job_title\": {\n", + " \"type\": \"string\"\n", + " },\n", + " \"sub_division_of_organization\": {\n", + " \"type\": \"string\"\n", + " },\n", + " \"key_skills\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"string\"\n", + " }\n", + " },\n", + " \"compensation\": {\n", + " \"type\": \"string\"\n", + " },\n", + " \"apply_link\": {\n", + " \"type\": \"string\"\n", + " }\n", + " },\n", + " \"required\": [\"job_title\", \"sub_division_of_organization\", \"key_skills\", \"compensation\", \"apply_link\"]\n", + "}\n", + "\n", + "# Initialize a list to store the extracted data\n", + "extracted_data = []\n", + "\n", + "# Process each link in the map result\n", + "for index, link in enumerate(links):\n", + " try:\n", + " response = requests.post(\n", + " \"https://api.firecrawl.dev/v1/scrape\",\n", + " headers={\n", + " \"Content-Type\": \"application/json\",\n", + " \"Authorization\": f\"Bearer {firecrawl_api_key}\"\n", + " },\n", + " json={\n", + " \"url\": link,\n", + " \"formats\": [\"extract\"],\n", + " \"extract\": {\n", + " \"schema\": extract_schema\n", + " }\n", + " }\n", + " )\n", + " \n", + " if response.status_code == 200:\n", + " result = response.json()\n", + " if result.get('success'):\n", + " extracted_data.append(result['data']['extract'])\n", + " print(f\"Data extracted for page {index}\")\n", + " else:\n", + " print(f\"No data extracted for page {index}\")\n", + " else:\n", + " print(f\"Error {response.status_code} for page {index}: {response.text}\")\n", + " except Exception as e:\n", + " print(f\"An error occurred for page {index}: {str(e)}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data:\n", + "{\n", + " \"job_title\": \"Analytics Engineer\",\n", + " \"sub_division_of_organization\": \"Growth\",\n", + " \"key_skills\": [\n", + " \"SQL\",\n", + " \"Python\",\n", + " \"business intelligence tools\",\n", + " \"ETL workflows\",\n", + " \"data analysis\",\n", + " \"dashboards\",\n", + " \"data storytelling\"\n", + " ],\n", + " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application\"\n", + "}\n", + 
"--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Solutions Architect\",\n", + " \"sub_division_of_organization\": \"Technical Success\",\n", + " \"key_skills\": [\n", + " \"technical consulting\",\n", + " \"Generative AI\",\n", + " \"ML solutions\",\n", + " \"network architecture\",\n", + " \"cloud architecture\",\n", + " \"Python\",\n", + " \"Javascript\"\n", + " ],\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"IAM Engineer\",\n", + " \"sub_division_of_organization\": \"IT\",\n", + " \"key_skills\": [\n", + " \"AzureAD\",\n", + " \"Python\",\n", + " \"PowerShell\",\n", + " \"identity governance\",\n", + " \"automation\",\n", + " \"Terraform\"\n", + " ],\n", + " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/e798aa62-74f9-4f53-a890-716310926b70/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Talent Partnerships\",\n", + " \"sub_division_of_organization\": \"Communications\",\n", + " \"key_skills\": [\n", + " \"relationship management\",\n", + " \"communication\",\n", + " \"adaptability\",\n", + " \"creativity\",\n", + " \"collaboration\",\n", + " \"transparency\"\n", + " ],\n", + " \"compensation\": \"$171K \\u2013 $240K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/84a4a8bb-7d5a-4989-9b5c-bd841db2698e/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"404 Error Page\",\n", + " \"sub_division_of_organization\": \"Web Development\",\n", + " \"key_skills\": [\n", + " \"Error Handling\",\n", + " \"Web Design\",\n", + " \"User Experience\"\n", + " ],\n", + " \"compensation\": \"N/A\",\n", + " \"apply_link\": \"N/A\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"\",\n", + " \"sub_division_of_organization\": \"\",\n", + " \"key_skills\": [],\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"AV Specialist\",\n", + " \"sub_division_of_organization\": \"IT\",\n", + " \"key_skills\": [\n", + " \"AV support\",\n", + " \"Google Meet\",\n", + " \"Zoom\",\n", + " \"Cisco\",\n", + " \"ticket management\",\n", + " \"IT troubleshooting\",\n", + " \"problem-solving\",\n", + " \"interpersonal skills\"\n", + " ],\n", + " \"compensation\": \"$110K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/20fd0ff8-dd5e-4bec-a401-dd3f8263fe24/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"IT Support\",\n", + " \"sub_division_of_organization\": \"IT\",\n", + " \"key_skills\": [\n", + " \"Intermediate-to-expert understanding of IDP and MDM solutions\",\n", + " \"Familiarity with Windows or Linux\",\n", + " \"Understanding of Python, Bash, or Apple Script\",\n", + " \"Experience with collaboration software\",\n", + " \"Hands-on expertise implementing and managing AV and telecom systems\",\n", + " \"Complete Mac and macOS troubleshooting skills\",\n", + " \"Adept in orchestrating high-production events\"\n", + " ],\n", + " \"compensation\": \"$110K \\u2013 $140K + Offers Equity\",\n", + " \"apply_link\": 
\"https://jobs.ashbyhq.com/openai/ca263679-08d5-4492-9a56-32fbcb7318a5/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"404\",\n", + " \"sub_division_of_organization\": \"OpenAI\",\n", + " \"key_skills\": [],\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Research Engineer\",\n", + " \"sub_division_of_organization\": \"Research\",\n", + " \"key_skills\": [\n", + " \"strong programming skills\",\n", + " \"experience working in large distributed systems\"\n", + " ],\n", + " \"compensation\": \"$295K \\u2013 $440K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/240d459b-696d-43eb-8497-fab3e56ecd9b/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Solutions Engineer\",\n", + " \"sub_division_of_organization\": \"Technical Success\",\n", + " \"key_skills\": [\n", + " \"7+ years of experience in a technical pre-sales role\",\n", + " \"Understanding of IT security principles\",\n", + " \"Experience with programming languages like Python or Javascript\",\n", + " \"Knowledge of network/cloud architecture\",\n", + " \"Effective presentation and communication skills\",\n", + " \"Ability to manage C-level technical and business relationships\"\n", + " ],\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/dbfef1b0-9a77-46bd-ad36-67f3d0286924/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Software Engineer, Networking\",\n", + " \"sub_division_of_organization\": \"Platform\",\n", + " \"key_skills\": [\n", + " \"C++\",\n", + " \"CUDA\",\n", + " \"distributed algorithms\",\n", + " \"RDMA\",\n", + " \"network simulation techniques\"\n", + " ],\n", + " \"compensation\": \"$360K \\u2013 $530K\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340c0c22-8d8f-4232-b17e-f642b64c25c3/application\"\n", + "}\n", + "--------------------------------------------------\n", + "{\n", + " \"job_title\": \"Revenue Operations Leader\",\n", + " \"sub_division_of_organization\": \"Revenue Operations\",\n", + " \"key_skills\": [\n", + " \"Extensive experience in revenue operations or strategy at a high-growth, technology company\",\n", + " \"Proficiency with GTM systems, namely SFDC, Gong\",\n", + " \"Experience managing a large team of 15+ operational team members\",\n", + " \"Highly analytical\",\n", + " \"Exceptional project management skills with experience leading complex, cross-functional initiatives\",\n", + " \"Deep experience designing & executing on a territory strategy for 100+ GTM orgs\",\n", + " \"Strong communication skills and executive presence\",\n", + " \"An understanding of the AI landscape, our applications, and the problems they solve for our customers\",\n", + " \"The ability to thrive in ambiguity and work autonomously\"\n", + " ],\n", + " \"compensation\": \"$325K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/61a484e5-4723-4031-92c1-068dfe4b069f/application\"\n", + "}\n", + "--------------------------------------------------\n", + "Extracted data saved to /Users/ericciarla/Documents/GitHub/firecrawl/examples/getting_latest_openai_jobs/openai_jobs.csv\n" + ] + } + ], + "source": [ + "# %%\n", + "# Print the extracted data\n", + "print(\"Extracted data:\")\n", + "for job in 
extracted_data:\n", + " print(json.dumps(job, indent=2))\n", + " print(\"-\" * 50) # Separator between jobs\n", + "\n", + "# Save as CSV\n", + "import csv\n", + "import os\n", + "\n", + "# Get the current directory\n", + "current_dir = os.getcwd()\n", + "\n", + "# Create the full path for the CSV file\n", + "csv_file = os.path.join(current_dir, \"openai_jobs.csv\")\n", + "\n", + "try:\n", + " with open(csv_file, \"w\", newline=\"\") as f:\n", + " if extracted_data:\n", + " writer = csv.DictWriter(f, fieldnames=extracted_data[0].keys())\n", + " writer.writeheader()\n", + " for job in extracted_data:\n", + " writer.writerow(job)\n", + " print(f\"Extracted data saved to {csv_file}\")\n", + " else:\n", + " print(\"No data to save.\")\n", + "except IOError as e:\n", + " print(f\"Error saving CSV file: {e}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recommended jobs:\n", + "[\n", + " {\n", + " \"job_title\": \"Analytics Engineer\",\n", + " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application\"\n", + " },\n", + " {\n", + " \"job_title\": \"Solutions Architect\",\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application\"\n", + " },\n", + " {\n", + " \"job_title\": \"Research Engineer\",\n", + " \"compensation\": \"$295K \\u2013 $440K + Offers Equity\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/240d459b-696d-43eb-8497-fab3e56ecd9b/application\"\n", + " },\n", + " {\n", + " \"job_title\": \"Solutions Engineer\",\n", + " \"compensation\": \"\",\n", + " \"apply_link\": \"https://jobs.ashbyhq.com/openai/dbfef1b0-9a77-46bd-ad36-67f3d0286924/application\"\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "from openai import OpenAI\n", + "\n", + "# Resume\n", + "resume_paste = \"\"\"\"\n", + "Eric Ciarla\n", + "Co-Founder @ Firecrawl\n", + "San Francisco, California, United States\n", + "Summary\n", + "Building…\n", + "Experience\n", + "Firecrawl\n", + "Co-Founder\n", + "April 2024 - Present (6 months)\n", + "San Francisco, California, United States\n", + "Firecrawl by Mendable. Building data extraction infrastructure for AI. 
Used by\n", + "Amazon, Zapier, and Nvidia (YC S22)\n", + "Mendable\n", + "2 years 7 months\n", + "Co-Founder @ Mendable.ai\n", + "March 2022 - Present (2 years 7 months)\n", + "San Francisco, California, United States\n", + "- Built an AI powered search platform that that served millions of queries for\n", + "hundreds of customers (YC S22)\n", + "- We were one of the first LLM powered apps adopted by industry leaders like\n", + "Coinbase, Snap, DoorDash, and MongoDB\n", + "Co-Founder @ SideGuide\n", + "March 2022 - Present (2 years 7 months)\n", + "San Francisco, California, United States\n", + "- Built and scaled an online course platform with a community of over 50,000\n", + "developers\n", + "- Selected for Y Combinator S22 batch, 2% acceptance rate\n", + "Fracta\n", + "Data Engineer\n", + "2022 - 2022 (less than a year)\n", + "Palo Alto, California, United States\n", + "- Demoed tool during sales calls and provided technical support during the\n", + "entire customer lifecycle\n", + "Page 1 of 2\n", + "- Mined, wrangled, & visualized geospatial and water utility data for predictive\n", + "analytics & ML workflows (Python, QGIS)\n", + "Ford Motor Company\n", + "Data Scientist\n", + "2021 - 2021 (less than a year)\n", + "Dearborn, Michigan, United States\n", + "- Extracted, cleaned, and joined data from multiple sources using SQL,\n", + "Hadoop, and Alteryx\n", + "- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the\n", + "relationships between survey free response verbatim topics (derived from\n", + "natural language processing models) and numerical customer experience\n", + "scores\n", + "MDRemindME\n", + "Co-Founder\n", + "2018 - 2020 (2 years)\n", + "Durham, New Hampshire, United States\n", + "- Founded and led a healthtech startup aimed at improving patient adherence\n", + "to treatment plans through an innovative engagement and retention tool\n", + "- Piloted the product with healthcare providers and patients, gathering critical\n", + "insights to refine functionality and enhance user experience\n", + "- Secured funding through National Science Foundation I-CORPS Grant and\n", + "UNH Entrepreneurship Center Seed Grant\n", + "Education\n", + "Y Combinator\n", + "S22\n", + "University of New Hampshire\n", + "Economics and Philosophy\n", + "\"\"\"\n", + "\n", + "# Use o1-preview to choose which jobs should be applied to based on the resume\n", + "client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n", + "\n", + "prompt = f\"\"\"\n", + "Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. 
The output should be a valid JSON array of objects in the following format, with no additional text:\n", + "\n", + "[\n", + " {{\n", + " \"job_title\": \"Job Title\",\n", + " \"compensation\": \"Compensation (if available, otherwise empty string)\",\n", + " \"apply_link\": \"Application URL\"\n", + " }},\n", + " ...\n", + "]\n", + "\n", + "Based on the following resume:\n", + "{resume_paste}\n", + "\n", + "And the following job listings:\n", + "{json.dumps(extracted_data, indent=2)}\n", + "\"\"\"\n", + "\n", + "completion = client.chat.completions.create(\n", + " model=\"o1-preview\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "recommended_jobs = json.loads(completion.choices[0].message.content.strip())\n", + "\n", + "print(\"Recommended jobs:\")\n", + "print(json.dumps(recommended_jobs, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# scrape each of the apply links with firecrawl /v1/scrape\n", + "import requests\n", + "\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "def scrape_apply_link(url):\n", + " api_url = \"https://api.firecrawl.dev/v1/scrape\"\n", + " headers = {\n", + " \"Authorization\": f\"Bearer {firecrawl_api_key}\",\n", + " \"Content-Type\": \"application/json\"\n", + " }\n", + " payload = {\n", + " \"url\": url\n", + " }\n", + " \n", + " response = requests.post(api_url, json=payload, headers=headers)\n", + " if response.status_code == 200:\n", + " return response.json()\n", + " else:\n", + " print(f\"Error scraping {url}: {response.status_code}\")\n", + " return None\n", + "\n", + "scraped_job_data = []\n", + "for job in recommended_jobs:\n", + " apply_link = job.get('apply_link')\n", + " if apply_link:\n", + " scraped_data = scrape_apply_link(apply_link)\n", + " if scraped_data:\n", + " scraped_job_data.append({\n", + " 'job_title': job['job_title'],\n", + " 'compensation': job['compensation'],\n", + " 'apply_link': apply_link,\n", + " 'scraped_content': scraped_data\n", + " })\n", + "\n", + "print(f\"Scraped {len(scraped_job_data)} job application pages\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use o1 to write the application for you and return in json\n", + "import json\n", + "\n", + "\n", + "def generate_application(job_data, resume_paste):\n", + " # Extract relevant information from scraped content\n", + " scraped_text = job_data['scraped_content'].get('text', '')\n", + " \n", + " prompt = f\"\"\"\n", + " Based on the following job information, scraped content from the application page, and the provided resume, write a tailored job application:\n", + "\n", + " Job Title: {job_data['job_title']}\n", + " Compensation: {job_data['compensation']}\n", + " Scraped Content: {scraped_text[:1000]} # Limit to first 1000 characters to avoid token limits\n", + "\n", + " Resume:\n", + " {resume_paste}\n", + "\n", + " Please format the application as a JSON object with the following fields:\n", + " - cover_letter: A personalized cover letter addressing key points from the scraped content and highlighting relevant experience from the resume\n", + " - resume_highlights: Key points from the resume that align with the job requirements mentioned in the scraped content\n", + " - questions: Any questions you have about the position, derived from 
the available information\n", + "\n", + " Ensure the content is specifically tailored to the information provided in the scraped content and leverages the experience detailed in the resume.\n", + " \"\"\"\n", + "\n", + " try:\n", + " completion = client.chat.completions.create(\n", + " model=\"o1-preview\",\n", + " messages=[\n", + " \n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " )\n", + " return json.loads(completion.choices[0].message.content)\n", + " except Exception as e:\n", + " print(f\"Error generating application: {str(e)}\")\n", + " return None\n", + "\n", + "\n", + "\n", + "applications = []\n", + "for job in scraped_job_data:\n", + " application = generate_application(job, resume_paste)\n", + " if application:\n", + " applications.append({\n", + " \"job_title\": job[\"job_title\"],\n", + " \"apply_link\": job[\"apply_link\"],\n", + " \"application\": application\n", + " })\n", + "\n", + "print(f\"Generated {len(applications)} job applications based on scraped content and resume\")\n", + "print(json.dumps(applications, indent=2))\n", + "\n", + "# Save the JSON to a file\n", + "output_file = \"generated_applications.json\"\n", + "with open(output_file, \"w\") as f:\n", + " json.dump(applications, f, indent=2)\n", + "\n", + "print(f\"Saved generated applications to {output_file}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/o1_web_crawler_actions/main.ipynb b/examples/o1_web_crawler_actions/main.ipynb new file mode 100644 index 0000000..7e30c5f --- /dev/null +++ b/examples/o1_web_crawler_actions/main.ipynb @@ -0,0 +1,60 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'success': True, 'data': {'html': '

Our first Launch Week is over! See the recap 🚀

💥 Get 2 months free with yearly plan

Turn websites into
LLM-ready data

Power your AI apps with clean data crawled from any website. It\\'s also open-source.

Talk to us
A product by
\"MendableMendable
\"Example

Crawl, Scrape, Clean

We crawl all accessible subpages and give you clean markdown for each. No sitemap required.

\\n  [\\n    {\\n      \"url\": \"https://www.firecrawl.dev/\",\\n      \"markdown\": \"## Welcome to Firecrawl\\n        Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/features\",\\n      \"markdown\": \"## Features\\n        Discover how Firecrawl\\'s cutting-edge features can \\n        transform your data operations.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/pricing\",\\n      \"markdown\": \"## Pricing Plans\\n        Choose the perfect plan that fits your needs.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/about\",\\n      \"markdown\": \"## About Us\\n        Learn more about Firecrawl\\'s mission and the \\n        team behind our innovative platform.\"\\n    }\\n  ]\\n  

Note: The markdown has been edited for display purposes.

Trusted by Top Companies

\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer
\"Customer

Integrate today

Enhance your applications with top-tier web scraping and crawling capabilities.

// npm install @mendable/firecrawl-js

import FirecrawlApp from \\'@mendable/firecrawl-js\\';

const app = new FirecrawlApp({ apiKey: \"fc-YOUR_API_KEY\" });

// Scrape a website:
const scrapeResult = await app.scrapeUrl(\\'firecrawl.dev\\');

console.log(scrapeResult.data.markdown)

Use well-known tools

Already fully integrated with the greatest existing tools and workflows.
\"LlamaIndex\"\"Langchain\"\"Dify\"\"Dify\"\"Flowise\"\"CrewAI\"

Start for free, scale easily

Kick off your journey for free and scale seamlessly as your project expands.

Open-source

Developed transparently and collaboratively. Join our community of contributors.

We handle the hard stuff

Rotating proxies, caching, rate limits, js-blocked content and more

Crawling

Firecrawl crawls all accessible subpages, even without a sitemap.

Dynamic content

Firecrawl gathers data even if a website uses javascript to render content.

To Markdown

Firecrawl returns clean, well-formatted markdown, ready for use in LLM applications

Crawling Orchestration

Firecrawl orchestrates the crawling process in parallel for the fastest results.

Caching

Firecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.

Built for AI

Built by LLM engineers, for LLM engineers. Giving you clean data the way you want it.

Our wall of love

Don\\'t take our word for it

Greg Kamradt
LLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.
Amit Naik
#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏
Jerry Liu
Firecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.
Bardia Pourvakil
These guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\'t recommend them enough.
latentsauce 🧘🏽
Firecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️
Michael Ning
Firecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.
Alex Reibman 🖇️
Moved our internal agent\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.
I really like some of the design decisions Firecrawl made, so I really want to share with others.
Paul Scott
Appreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.

Flexible Pricing

Start for free, then scale as you grow

Free Plan

500 credits

$0 one-time

  • Scrape 500 pages
  • 10 /scrape per min
  • 1 /crawl per min

Hobby

3,000 credits

$16/month

Billed annually
  • Scrape 3,000 pages*
  • 20 /scrape per min
  • 3 /crawl per min

Standard (Most Popular)

100,000 credits

$83/month

Billed annually
  • Scrape 100,000 pages*
  • 100 /scrape per min
  • 10 /crawl per min
  • 2 seats

Growth

500,000 credits

$333/month

Billed annually
  • Scrape 500,000 pages*
  • 1000 /scrape per min
  • 50 /crawl per min
  • 4 seats
  • Priority Support

Enterprise Plan

Unlimited credits. Custom RPMs.

  • Top priority support
  • Feature Acceleration
  • SLAs
  • Account Manager
  • Custom rate limits volume
  • Custom concurrency limits
  • Custom seats
  • CEO\\'s number

* a /scrape refers to the scrape API endpoint. Structured extraction costs vary. See credits table.

* a /crawl refers to the crawl API endpoint.

API Credits

Credits are consumed for each API request, varying by endpoint and feature.

Features | Credits
Scrape (/scrape) | 1 / page
Crawl (/crawl) | 1 / page
Map (/map) | 1 / call
Search (/search) | 1 / page
Scrape + LLM extraction (/scrape) | 5 / page

Ready to Build?

Start scraping web data for your AI apps today.
No credit card needed.

FAQ

Frequently asked questions about Firecrawl

General

What is Firecrawl?

Firecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.

What sites work?

Firecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.

Who can benefit from using Firecrawl?

Firecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.

Is Firecrawl open-source?

Yes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.

Scraping & Crawling

How does Firecrawl handle dynamic content on websites?

Unlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.

Why is it not crawling all the pages?

There are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Common reasons include rate limiting and anti-scraping mechanisms that prevent the crawler from accessing certain pages. If you\'re experiencing issues with the crawler, please reach out to our support team at help@firecrawl.com.

Can Firecrawl crawl websites without a sitemap?

Yes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.

What formats can Firecrawl convert web data into?

Firecrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.

How does Firecrawl ensure the cleanliness of the data?

Firecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.

Is Firecrawl suitable for large-scale data scraping projects?

Absolutely. Firecrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.

Does it respect robots.txt?

Yes, the Firecrawl crawler respects the rules set in a website\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\'s behavior. The Firecrawl user agent name is \'FirecrawlAgent\'. If you notice any behavior that is not expected, please let us know at help@firecrawl.com.

What measures does Firecrawl take to handle web scraping challenges like rate limits and caching?

Firecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.

Does Firecrawl handle captcha or authentication?

Firecrawl avoids captcha by using stealth proxies. When it encounters captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha-solving methods. Firecrawl can handle authentication by providing auth headers to the API.

API Related

Where can I find my API key?

Click on the dashboard button on the top navigation menu when logged in and you will find your API key in the main screen and under API Keys.

Billing

Is Firecrawl free?

Firecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Scale plans for more credits.

Is there a pay per use plan instead of monthly?

No, we do not currently offer a pay-per-use plan; instead, you can upgrade to our Standard or Growth plans for more credits and higher rate limits.

How many credits do scraping, crawling, and extraction cost?

Scraping costs 1 credit per page. Crawling costs 1 credit per page.

Do you charge for failed requests (scrape, crawl, extract)?

We do not charge for any failed requests (scrape, crawl, extract). Please contact support at caleb@firecrawl.com if you have any questions.

What payment methods do you accept?

We accept payments through Stripe which accepts most major credit cards, debit cards, and PayPal.

', 'actions': {'screenshots': []}, 'metadata': {'title': 'Home - Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'language': 'en', 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'robots': 'follow, index', 'ogTitle': 'Firecrawl', 'ogDescription': 'Turn any website into LLM-ready data.', 'ogUrl': 'https://www.firecrawl.dev/', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogLocaleAlternate': [], 'ogSiteName': 'Firecrawl', 'sourceURL': 'https://www.firecrawl.dev', 'statusCode': 200}}}\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import requests\n",
    "\n",
    "# Read the Firecrawl API key from the environment instead of hardcoding it\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "payload = {\n",
    "    \"url\": \"https://www.firecrawl.dev\",\n",
    "    \"formats\": [\"html\"],\n",
    "    \"actions\": [{'type': 'click', 'selector': 'a[href=\"https://calendly.com/d/cj83-ngq-knk/meet-firecrawl\"]'}]\n",
    "}\n",
    "headers = {\n",
    "    \"Authorization\": f\"Bearer {firecrawl_api_key}\",\n",
    "    \"Content-Type\": \"application/json\"\n",
    "}\n",
    "\n",
    "response = requests.post(\"https://api.firecrawl.dev/v1/scrape\", json=payload, headers=headers)\n",
    "\n",
    "scrape_result = response.json()\n",
    "print(scrape_result)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
diff --git a/examples/o1_web_crawler_actions/o1_web_crawler_actions.py b/examples/o1_web_crawler_actions/o1_web_crawler_actions.py
new file mode 100644
index 0000000..e9d5698
--- /dev/null
+++ b/examples/o1_web_crawler_actions/o1_web_crawler_actions.py
@@ -0,0 +1,271 @@
+import os
+import json
+import requests
+from dotenv import load_dotenv
+from openai import OpenAI
+import re
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+
+# Load environment variables
+load_dotenv()
+
+# Retrieve API keys from environment variables
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+openai_api_key = os.getenv("OPENAI_API_KEY")
+
+# Initialize the OpenAI client
+client = OpenAI(api_key=openai_api_key)
+
+# Step 1: Get objective and URL
+def get_objective_and_url():
+    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
+    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
+    return objective, url
+
+# Function to get top N pages from a URL using Firecrawl Map API
+def get_top_pages(url, search_term, num_pages=3):
+    try:
+        print(f"{Colors.YELLOW}Mapping website using the Firecrawl Map API...{Colors.RESET}")
+        api_url = "https://api.firecrawl.dev/v1/map"
+        payload = {
+            "url": url,
+            "search": search_term,
+        }
+        headers = {
+            "Authorization": f"Bearer {firecrawl_api_key}",
+            "Content-Type": "application/json"
+        }
+        response = requests.post(api_url, json=payload, headers=headers)
+        if response.status_code == 200:
+            map_result = response.json()
+
+            if map_result.get('success'):
+                links = map_result.get('links', [])
+                top_pages = links[:num_pages]
+                print(f"{Colors.GREEN}Found {len(links)} links. 
Using top {num_pages} pages.{Colors.RESET}") + for i, page in enumerate(top_pages, 1): + print(f"{Colors.CYAN}URL {i}: {page}{Colors.RESET}") + return top_pages + else: + print(f"{Colors.RED}Error: Map API request was not successful{Colors.RESET}") + return [] + else: + print(f"{Colors.RED}Error: Received status code {response.status_code} from Map API{Colors.RESET}") + return [] + except Exception as e: + print(f"{Colors.RED}Error encountered during mapping: {str(e)}{Colors.RESET}") + return [] + +# Step 2: Visit a page and get HTML +def visit_page_and_get_html(url, actions): + try: + if actions: + print(f"{Colors.YELLOW}Scraping page: {url} with actions:{Colors.RESET}") + for action in actions: + print(f" - {action}") + else: + print(f"{Colors.YELLOW}Scraping page: {url}{Colors.RESET}") + + payload = { + "url": url, + "formats": ["html"], + "actions": actions + } + headers = { + "Authorization": f"Bearer {firecrawl_api_key}", + "Content-Type": "application/json" + } + + response = requests.post("https://api.firecrawl.dev/v1/scrape", json=payload, headers=headers) + + if response.status_code == 200: + scrape_result = response.json() + html_content = scrape_result["data"]["html"] + if len(actions) > 0: + print("html_content: ", scrape_result) + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + return html_content + else: + print(f"{Colors.RED}Error: Received status code {response.status_code}{Colors.RESET}") + return None + except Exception as e: + print(f"{Colors.RED}Error encountered during page scraping: {str(e)}{Colors.RESET}") + return None + +# Step 3: Process the page to fulfill the objective or decide next action +def process_page(html_content, objective): + try: + process_prompt = f""" +You are an AI assistant helping to achieve the following objective: '{objective}'. +Given the HTML content of a web page, determine if the objective is met. + +Instructions: +1. If the objective is met, respond in JSON format as follows: +{{ + "status": "Objective met", + "data": {{ ... extracted information ... }} +}} + +2. If the objective is not met, analyze the HTML content to decide the best next action to get closer to the objective. Provide the action(s) needed to navigate to the next page or interact with the page. Respond in JSON format as follows: +{{ + "status": "Objective not met", + "actions": [{{ ... actions to perform ... }}] +}} + +3. The actions should be in the format accepted by the 'actions' parameter of the 'scrape_url' function in Firecrawl. Available actions include: + - {{"type": "wait", "milliseconds": }} + Example: {{"type": "wait", "milliseconds": 2000}} + - {{"type": "click", "selector": ""}} + Example: {{"type": "click", "selector": "#load-more-button"}} + - {{"type": "write", "text": "", "selector": ""}} + Example: {{"type": "write", "text": "Hello, world!", "selector": "#search-input"}} + - {{"type": "press", "key": ""}} + Example: {{"type": "press", "key": "Enter"}} + - {{"type": "scroll", "direction": "", "amount": }} + Example: {{"type": "scroll", "direction": "down", "amount": 500}} + +4. Do not include any explanations or additional text outside of the JSON response. 
+ +HTML Content: +{html_content[:20000]} +""" + + completion = client.chat.completions.create( + model="o1-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": process_prompt + } + ] + } + ] + ) + + response = completion.choices[0].message.content.strip() + + # Remove any JSON code blocks from the response + response = re.sub(r'```json\s*(.*?)\s*```', r'\1', response, flags=re.DOTALL) + + # Parse the response as JSON + try: + result = json.loads(response) + status = result.get('status') + if status == 'Objective met': + data = result.get('data') + return {'result': data} + elif status == 'Objective not met': + actions = result.get('actions') + return {'actions': actions} + else: + print(f"{Colors.RED}Unexpected status in response: {status}{Colors.RESET}") + return {} + except json.JSONDecodeError: + print(f"{Colors.RED}Error parsing assistant's response as JSON.{Colors.RESET}") + print(f"{Colors.RED}Response was: {response}{Colors.RESET}") + return {} + except Exception as e: + print(f"{Colors.RED}Error encountered during processing of the page: {str(e)}{Colors.RESET}") + return {} + +# Function to determine search term based on the objective +def determine_search_term(objective): + try: + prompt = f""" +Based on the following objective: '{objective}', provide a 1-2 word search term that would help find relevant pages on the website. Only respond with the search term and nothing else. +""" + completion = client.chat.completions.create( + model="o1-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt + } + ] + } + ] + ) + search_term = completion.choices[0].message.content.strip() + print(f"{Colors.GREEN}Determined search term: {search_term}{Colors.RESET}") + return search_term + except Exception as e: + print(f"{Colors.RED}Error determining search term: {str(e)}{Colors.RESET}") + return "" + +# Main function +def main(): + objective, url = get_objective_and_url() + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + + # Determine search term based on objective + search_term = determine_search_term(objective) + if not search_term: + print(f"{Colors.RED}Could not determine a search term based on the objective.{Colors.RESET}") + return + + # Get the top 3 pages using Firecrawl Map API + top_pages = get_top_pages(url, search_term, num_pages=3) + if not top_pages: + print(f"{Colors.RED}No pages found to process.{Colors.RESET}") + return + + for page_url in top_pages: + print(f"{Colors.CYAN}Processing page: {page_url}{Colors.RESET}") + + # Step 2: Visit page and get HTML + html_content = visit_page_and_get_html(page_url, actions=[]) + if not html_content: + print(f"{Colors.RED}Failed to retrieve content from {page_url}{Colors.RESET}") + continue + + # Step 3: Process HTML and objective + action_result = process_page(html_content, objective) + if action_result.get('result'): + print(f"{Colors.GREEN}Objective met. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(action_result['result'], indent=2)}{Colors.RESET}") + return + elif action_result.get('actions'): + print(f"{Colors.YELLOW}Objective not met yet. 
Suggested actions:{Colors.RESET}") + for action in action_result['actions']: + print(f"{Colors.MAGENTA}- {action}{Colors.RESET}") + actions = action_result['actions'] + # Visit the page again with the actions + html_content = visit_page_and_get_html(page_url, actions) + if not html_content: + print(f"{Colors.RED}Failed to retrieve content from {page_url} with actions{Colors.RESET}") + continue + # Process the new HTML + action_result = process_page(html_content, objective) + if action_result.get('result'): + print(f"{Colors.GREEN}Objective met after performing actions. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(action_result['result'], indent=2)}{Colors.RESET}") + return + else: + print(f"{Colors.RED}Objective still not met after performing actions on {page_url}{Colors.RESET}") + continue + else: + print(f"{Colors.RED}No actions suggested. Unable to proceed with {page_url}.{Colors.RESET}") + continue + + # If we reach here, the objective was not met on any of the pages + print(f"{Colors.RED}Objective not fulfilled after processing top 3 pages.{Colors.RESET}") + +if __name__ == "__main__": + main() diff --git a/examples/o1_web_crawler_actions/requirements.txt b/examples/o1_web_crawler_actions/requirements.txt new file mode 100644 index 0000000..249f8be --- /dev/null +++ b/examples/o1_web_crawler_actions/requirements.txt @@ -0,0 +1,3 @@ +firecrawl-py +python-dotenv +openai \ No newline at end of file From 51bc2f25fe553e0beb631c7cbe8c1d2929be72d8 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 26 Sep 2024 11:44:55 -0400 Subject: [PATCH 2/4] remove actions crawler --- examples/o1_web_crawler_actions/main.ipynb | 60 ---- .../o1_web_crawler_actions.py | 271 ------------------ .../o1_web_crawler_actions/requirements.txt | 3 - 3 files changed, 334 deletions(-) delete mode 100644 examples/o1_web_crawler_actions/main.ipynb delete mode 100644 examples/o1_web_crawler_actions/o1_web_crawler_actions.py delete mode 100644 examples/o1_web_crawler_actions/requirements.txt diff --git a/examples/o1_web_crawler_actions/main.ipynb b/examples/o1_web_crawler_actions/main.ipynb deleted file mode 100644 index 7e30c5f..0000000 --- a/examples/o1_web_crawler_actions/main.ipynb +++ /dev/null @@ -1,60 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'success': True, 'data': {'html': '

    Our first Launch Week is over! See the recap 🚀

    💥 Get 2 months free with yearly plan

    Turn websites into
    LLM-ready data

    Power your AI apps with clean data crawled from any website. It\\'s also open-source.

    Talk to us
    A product by
    \"MendableMendable
    \"Example

    Crawl, Scrape, Clean

    We crawl all accessible subpages and give you clean markdown for each. No sitemap required.

    \\n  [\\n    {\\n      \"url\": \"https://www.firecrawl.dev/\",\\n      \"markdown\": \"## Welcome to Firecrawl\\n        Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/features\",\\n      \"markdown\": \"## Features\\n        Discover how Firecrawl\\'s cutting-edge features can \\n        transform your data operations.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/pricing\",\\n      \"markdown\": \"## Pricing Plans\\n        Choose the perfect plan that fits your needs.\"\\n    },\\n    {\\n      \"url\": \"https://www.firecrawl.dev/about\",\\n      \"markdown\": \"## About Us\\n        Learn more about Firecrawl\\'s mission and the \\n        team behind our innovative platform.\"\\n    }\\n  ]\\n  

    Note: The markdown has been edited for display purposes.

    Trusted by Top Companies

    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer
    \"Customer

    Integrate today

    Enhance your applications with top-tier web scraping and crawling capabilities.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    // npm install @mendable/firecrawl-js

    import FirecrawlApp from \\'@mendable/firecrawl-js\\';

    const app = new FirecrawlApp({ apiKey: \"fc-YOUR_API_KEY\" });

    // Scrape a website:
    const scrapeResult = await app.scrapeUrl(\\'firecrawl.dev\\');

    console.log(scrapeResult.data.markdown)

    Use well-known tools

    Already fully integrated with the greatest existing tools and workflows.
    \"LlamaIndex\"\"Langchain\"\"Dify\"\"Dify\"\"Flowise\"\"CrewAI\"

    Start for free, scale easily

    Kick off your journey for free and scale seamlessly as your project expands.

    Open-source

    Developed transparently and collaboratively. Join our community of contributors.

    We handle the hard stuff

    Rotating proxies, caching, rate limits, js-blocked content and more

    Crawling

    Firecrawl crawls all accessible subpages, even without a sitemap.

    Dynamic content

    Firecrawl gathers data even if a website uses javascript to render content.

    To Markdown

    Firecrawl returns clean, well formatted markdown - ready for use in LLM applications

    Crawling Orchestration

    Firecrawl orchestrates the crawling process in parallel for the fastest results.

    Caching

    Firecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.

    Built for AI

    Built by LLM engineers, for LLM engineers. Giving you clean data the way you want it.

    Our wall of love

    Don\\'t take our word for it

    div]:mx-3 animate-infinite-scroll group-hover:[animation-play-state:paused]\">
    \"Greg
    Greg Kamradt
    LLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.
    \"Amit
    Amit Naik
    #llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏
    \"Jerry
    Jerry Liu
    Firecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.
    \"Bardia
    Bardia Pourvakil
    These guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.
    \"latentsauce
    latentsauce 🧘🏽
    Firecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️
    div]:mx-3 animate-infinite-scroll group-hover:[animation-play-state:paused]\" aria-hidden=\"true\">
    \"Greg
    Greg Kamradt
    LLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.
    \"Amit
    Amit Naik
    #llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. 👏
    \"Jerry
    Jerry Liu
    Firecrawl is awesome 🔥 Turns web pages into structured markdown for LLM apps, thanks to @mendableai.
    \"Bardia
    Bardia Pourvakil
    These guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.
    \"latentsauce
    latentsauce 🧘🏽
    Firecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ❤️❤️❤️
    div]:mx-3 animate-infinite-scroll-inverse group-hover:[animation-play-state:paused] [animation-delay:-7.5s]\">
    \"Michael
    Michael Ning
    Firecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.
    \"Alex
    Alex Reibman 🖇️
    Moved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.
    I really like some of the design decisions Firecrawl made, so I really want to share with others.
    \"Paul
    Paul Scott
    Appreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.
    div]:mx-3 animate-infinite-scroll-inverse group-hover:[animation-play-state:paused] [animation-delay:-7.5s]\" aria-hidden=\"true\">
    \"Michael
    Michael Ning
    Firecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.
    \"Alex
    Alex Reibman 🖇️
    Moved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.
    I really like some of the design decisions Firecrawl made, so I really want to share with others.
    \"Paul
    Paul Scott
    Appreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.

    Flexible Pricing

    Start for free, then scale as you grow

    Free Plan

    500 credits

    $0 one-time

    • Scrape 500 pages
    • 10 /scrape per min
    • 1 /crawl per min

    Hobby

    3,000 credits

    $16/month

    Billed annually
    • Scrape 3,000 pages*
    • 20 /scrape per min
    • 3 /crawl per min

    StandardMost Popular

    100,000 credits

    $83/month

    Billed annually
    • Scrape 100,000 pages*
    • 100 /scrape per min
    • 10 /crawl per min
    • 2 seats

    Growth

    500,000 credits

    $333/month

    Billed annually
    • Scrape 500,000 pages*
    • 1000 /scrape per min
    • 50 /crawl per min
    • 4 seats
    • Priority Support

    Enterprise Plan

    Unlimited credits. Custom RPMs.

    • Top priority support
    • Feature Acceleration
    • SLAs
    • Account Manager
    • Custom rate limits volume
    • Custom concurrency limits
    • Custom seats
    • CEO\\'s number

    * a /scrape refers to the scrape API endpoint. Structured extraction costs vary. See credits table.

    * a /crawl refers to the crawl API endpoint.

    API Credits

    Credits are consumed for each API request, varying by endpoint and feature.

    FeaturesCredits
    Scrape(/scrape)1 / page
    Crawl(/crawl)1 / page
    Map (/map)1 / call
    Search(/search)1 / page
    Scrape + LLM extraction (/scrape)5 / page

    Ready to Build?

    Start scraping web data for your AI apps today.
    No credit card needed.

    FAQ

    Frequently asked questions about Firecrawl

    General

    What is Firecrawl?

    Firecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.
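
    Concretely, "a single API" means one POST per page. A minimal markdown scrape, mirroring the request shape used throughout this example (the API key placeholder is illustrative):

    ```python
    import requests

    response = requests.post(
        "https://api.firecrawl.dev/v1/scrape",
        headers={
            "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY",
            "Content-Type": "application/json",
        },
        json={"url": "https://example.com", "formats": ["markdown"]},
    )
    markdown = response.json()["data"]["markdown"]  # clean, LLM-ready markdown
    ```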

    What sites work?

    Firecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.

    Who can benefit from using Firecrawl?

    Firecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.

    Is Firecrawl open-source?

    Yes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.

    Scraping & Crawling

    How does Firecrawl handle dynamic content on websites?

    Unlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.
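
    For pages that render content client-side, the actions parameter (demonstrated later in this file) can insert a wait before the page is captured. A minimal sketch; the 2000 ms figure is an arbitrary example, not a recommended value:

    ```python
    import requests

    payload = {
        "url": "https://example.com/js-heavy-page",
        "formats": ["html"],
        # Give client-side JavaScript time to render before the HTML is captured
        "actions": [{"type": "wait", "milliseconds": 2000}],
    }
    response = requests.post(
        "https://api.firecrawl.dev/v1/scrape",
        headers={
            "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY",
            "Content-Type": "application/json",
        },
        json=payload,
    )
    ```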

    Why is it not crawling all the pages?

    There are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Common reasons include rate limiting and anti-scraping mechanisms that block the crawler from accessing certain pages. If you\\'re experiencing issues with the crawler, please reach out to our support team at help@firecrawl.com.

    Can Firecrawl crawl websites without a sitemap?

    Yes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.
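
    A sitemap-free crawl needs only a starting URL. A minimal sketch against the /crawl endpoint referenced in the pricing notes above; the limit parameter is assumed here to cap discovered pages, by analogy with the /map payload used earlier:

    ```python
    import requests

    response = requests.post(
        "https://api.firecrawl.dev/v1/crawl",
        headers={
            "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY",
            "Content-Type": "application/json",
        },
        # No sitemap required: pages are discovered by following links from the start URL
        json={"url": "https://example.com", "limit": 50},
    )
    print(response.json())  # crawl job details
    ```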

    What formats can Firecrawl convert web data into?

    Firecrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.

    How does Firecrawl ensure the cleanliness of the data?

    Firecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.

    Is Firecrawl suitable for large-scale data scraping projects?

    Absolutely. Firecrawl offers various pricing plans, including Growth and Enterprise plans that support scraping millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.

    Does it respect robots.txt?

    Yes, the Firecrawl crawler respects the rules set in a website\\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\\'s behavior. Firecrawl\\'s user agent name is \\'FirecrawlAgent\\'. If you notice any unexpected behavior, please let us know at help@firecrawl.com.
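
    Site owners can verify what the FirecrawlAgent user agent may fetch with the robots.txt parser in the Python standard library. A minimal sketch (example.com stands in for any site):

    ```python
    from urllib.robotparser import RobotFileParser

    parser = RobotFileParser()
    parser.set_url("https://example.com/robots.txt")
    parser.read()  # fetch and parse the robots.txt rules for the site

    # True if the rules allow the FirecrawlAgent user agent to fetch this page
    print(parser.can_fetch("FirecrawlAgent", "https://example.com/private/page"))
    ```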

    What measures does Firecrawl take to handle web scraping challenges like rate limits and caching?

    Firecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.
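
    On the client side, the usual complement is to retry with exponential backoff if a request is rate-limited anyway. A minimal sketch; the HTTP 429 handling and retry counts are generic practice, not documented Firecrawl behavior:

    ```python
    import time
    import requests

    def scrape_with_backoff(url, api_key, max_retries=4):
        """POST to /v1/scrape, backing off exponentially on HTTP 429 responses."""
        for attempt in range(max_retries):
            response = requests.post(
                "https://api.firecrawl.dev/v1/scrape",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={"url": url, "formats": ["markdown"]},
            )
            if response.status_code != 429:
                return response.json()
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, 8s between retries
        raise RuntimeError(f"Still rate limited after {max_retries} attempts")
    ```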

    Does Firecrawl handle captcha or authentication?

    Firecrawl avoids captchas by using stealth proxies. When it encounters a captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha-solving methods. Firecrawl can handle authentication by providing auth headers to the API.
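
    For pages behind a login, those auth headers ride along in the scrape payload. A minimal sketch; the headers field is assumed here to be forwarded to the target site, and both tokens are placeholders:

    ```python
    import requests

    payload = {
        "url": "https://example.com/account/dashboard",
        "formats": ["markdown"],
        # Assumed to be forwarded to the target site as request headers
        "headers": {"Authorization": "Bearer YOUR_SITE_SESSION_TOKEN"},
    }
    response = requests.post(
        "https://api.firecrawl.dev/v1/scrape",
        headers={
            "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY",
            "Content-Type": "application/json",
        },
        json=payload,
    )
    ```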

    API Related

    Where can I find my API key?

    Click the dashboard button in the top navigation menu when logged in; you will find your API key on the main screen and under API Keys.

    Billing

    Is Firecrawl free?

    Firecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Growth plans for more credits.

    Is there a pay per use plan instead of monthly?

    No, we do not currently offer a pay-per-use plan; instead, you can upgrade to our Standard or Growth plans for more credits and higher rate limits.

    How many credits do scraping, crawling, and extraction cost?

    Scraping costs 1 credit per page, and crawling costs 1 credit per page. Scrape plus LLM extraction costs 5 credits per page (see the credits table above).

    Do you charge for failed requests (scrape, crawl, extract)?

    We do not charge for any failed requests (scrape, crawl, extract). Please contact support at caleb@firecrawl.com if you have any questions.

    What payment methods do you accept?

    We accept payments through Stripe, which supports most major credit cards, debit cards, and PayPal.

      ', 'actions': {'screenshots': []}, 'metadata': {'title': 'Home - Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'language': 'en', 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'robots': 'follow, index', 'ogTitle': 'Firecrawl', 'ogDescription': 'Turn any website into LLM-ready data.', 'ogUrl': 'https://www.firecrawl.dev/', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogLocaleAlternate': [], 'ogSiteName': 'Firecrawl', 'sourceURL': 'https://www.firecrawl.dev', 'statusCode': 200}}}\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "payload = {\n", - " \"url\": \"https://www.firecrawl.dev\",\n", - " \"formats\": [\"html\"],\n", - " \"actions\": [{'type': 'click', 'selector': 'a[href=\"https://calendly.com/d/cj83-ngq-knk/meet-firecrawl\"]'}]\n", - " }\n", - "headers = {\n", - " \"Authorization\": f\"Bearer fc-fa95acf54c0e496fbe6b403745f246ab\",\n", - " \"Content-Type\": \"application/json\"\n", - " }\n", - "\n", - "response = requests.post(\"https://api.firecrawl.dev/v1/scrape\", json=payload, headers=headers)\n", - " \n", - "\n", - "scrape_result = response.json() \n", - "print(scrape_result)\n", - "\n", - " \n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/o1_web_crawler_actions/o1_web_crawler_actions.py b/examples/o1_web_crawler_actions/o1_web_crawler_actions.py deleted file mode 100644 index e9d5698..0000000 --- a/examples/o1_web_crawler_actions/o1_web_crawler_actions.py +++ /dev/null @@ -1,271 +0,0 @@ -import os -import json -import requests -from dotenv import load_dotenv -from openai import OpenAI -import re - -# ANSI color codes -class Colors: - CYAN = '\033[96m' - YELLOW = '\033[93m' - GREEN = '\033[92m' - RED = '\033[91m' - MAGENTA = '\033[95m' - BLUE = '\033[94m' - RESET = '\033[0m' - -# Load environment variables -load_dotenv() - -# Retrieve API keys from environment variables -firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") -openai_api_key = os.getenv("OPENAI_API_KEY") - -# Initialize the OpenAI client -client = OpenAI(api_key=openai_api_key) - -# Step 1: Get objective and URL -def get_objective_and_url(): - url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") - objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") - return objective, url - -# Function to get top N pages from a URL using Firecrawl Map API -def get_top_pages(url, search_term, num_pages=3): - try: - print(f"{Colors.YELLOW}Mapping website using the Firecrawl Map API...{Colors.RESET}") - api_url = "https://api.firecrawl.dev/v1/map" - payload = { - "url": url, - "search": search_term, - } - headers = { - "Authorization": f"Bearer {firecrawl_api_key}", - "Content-Type": "application/json" - } - response = requests.post(api_url, json=payload, headers=headers) - if response.status_code == 200: - map_result = response.json() - - if map_result.get('success'): - links = map_result.get('links', []) - top_pages = links[:num_pages] - print(f"{Colors.GREEN}Found {len(links)} links. 
Using top {num_pages} pages.{Colors.RESET}") - for i, page in enumerate(top_pages, 1): - print(f"{Colors.CYAN}URL {i}: {page}{Colors.RESET}") - return top_pages - else: - print(f"{Colors.RED}Error: Map API request was not successful{Colors.RESET}") - return [] - else: - print(f"{Colors.RED}Error: Received status code {response.status_code} from Map API{Colors.RESET}") - return [] - except Exception as e: - print(f"{Colors.RED}Error encountered during mapping: {str(e)}{Colors.RESET}") - return [] - -# Step 2: Visit a page and get HTML -def visit_page_and_get_html(url, actions): - try: - if actions: - print(f"{Colors.YELLOW}Scraping page: {url} with actions:{Colors.RESET}") - for action in actions: - print(f" - {action}") - else: - print(f"{Colors.YELLOW}Scraping page: {url}{Colors.RESET}") - - payload = { - "url": url, - "formats": ["html"], - "actions": actions - } - headers = { - "Authorization": f"Bearer {firecrawl_api_key}", - "Content-Type": "application/json" - } - - response = requests.post("https://api.firecrawl.dev/v1/scrape", json=payload, headers=headers) - - if response.status_code == 200: - scrape_result = response.json() - html_content = scrape_result["data"]["html"] - if len(actions) > 0: - print("html_content: ", scrape_result) - print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") - - return html_content - else: - print(f"{Colors.RED}Error: Received status code {response.status_code}{Colors.RESET}") - return None - except Exception as e: - print(f"{Colors.RED}Error encountered during page scraping: {str(e)}{Colors.RESET}") - return None - -# Step 3: Process the page to fulfill the objective or decide next action -def process_page(html_content, objective): - try: - process_prompt = f""" -You are an AI assistant helping to achieve the following objective: '{objective}'. -Given the HTML content of a web page, determine if the objective is met. - -Instructions: -1. If the objective is met, respond in JSON format as follows: -{{ - "status": "Objective met", - "data": {{ ... extracted information ... }} -}} - -2. If the objective is not met, analyze the HTML content to decide the best next action to get closer to the objective. Provide the action(s) needed to navigate to the next page or interact with the page. Respond in JSON format as follows: -{{ - "status": "Objective not met", - "actions": [{{ ... actions to perform ... }}] -}} - -3. The actions should be in the format accepted by the 'actions' parameter of the 'scrape_url' function in Firecrawl. Available actions include: - - {{"type": "wait", "milliseconds": }} - Example: {{"type": "wait", "milliseconds": 2000}} - - {{"type": "click", "selector": ""}} - Example: {{"type": "click", "selector": "#load-more-button"}} - - {{"type": "write", "text": "", "selector": ""}} - Example: {{"type": "write", "text": "Hello, world!", "selector": "#search-input"}} - - {{"type": "press", "key": ""}} - Example: {{"type": "press", "key": "Enter"}} - - {{"type": "scroll", "direction": "", "amount": }} - Example: {{"type": "scroll", "direction": "down", "amount": 500}} - -4. Do not include any explanations or additional text outside of the JSON response. 
- -HTML Content: -{html_content[:20000]} -""" - - completion = client.chat.completions.create( - model="o1-preview", - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": process_prompt - } - ] - } - ] - ) - - response = completion.choices[0].message.content.strip() - - # Remove any JSON code blocks from the response - response = re.sub(r'```json\s*(.*?)\s*```', r'\1', response, flags=re.DOTALL) - - # Parse the response as JSON - try: - result = json.loads(response) - status = result.get('status') - if status == 'Objective met': - data = result.get('data') - return {'result': data} - elif status == 'Objective not met': - actions = result.get('actions') - return {'actions': actions} - else: - print(f"{Colors.RED}Unexpected status in response: {status}{Colors.RESET}") - return {} - except json.JSONDecodeError: - print(f"{Colors.RED}Error parsing assistant's response as JSON.{Colors.RESET}") - print(f"{Colors.RED}Response was: {response}{Colors.RESET}") - return {} - except Exception as e: - print(f"{Colors.RED}Error encountered during processing of the page: {str(e)}{Colors.RESET}") - return {} - -# Function to determine search term based on the objective -def determine_search_term(objective): - try: - prompt = f""" -Based on the following objective: '{objective}', provide a 1-2 word search term that would help find relevant pages on the website. Only respond with the search term and nothing else. -""" - completion = client.chat.completions.create( - model="o1-preview", - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": prompt - } - ] - } - ] - ) - search_term = completion.choices[0].message.content.strip() - print(f"{Colors.GREEN}Determined search term: {search_term}{Colors.RESET}") - return search_term - except Exception as e: - print(f"{Colors.RED}Error determining search term: {str(e)}{Colors.RESET}") - return "" - -# Main function -def main(): - objective, url = get_objective_and_url() - - print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") - - # Determine search term based on objective - search_term = determine_search_term(objective) - if not search_term: - print(f"{Colors.RED}Could not determine a search term based on the objective.{Colors.RESET}") - return - - # Get the top 3 pages using Firecrawl Map API - top_pages = get_top_pages(url, search_term, num_pages=3) - if not top_pages: - print(f"{Colors.RED}No pages found to process.{Colors.RESET}") - return - - for page_url in top_pages: - print(f"{Colors.CYAN}Processing page: {page_url}{Colors.RESET}") - - # Step 2: Visit page and get HTML - html_content = visit_page_and_get_html(page_url, actions=[]) - if not html_content: - print(f"{Colors.RED}Failed to retrieve content from {page_url}{Colors.RESET}") - continue - - # Step 3: Process HTML and objective - action_result = process_page(html_content, objective) - if action_result.get('result'): - print(f"{Colors.GREEN}Objective met. Extracted information:{Colors.RESET}") - print(f"{Colors.MAGENTA}{json.dumps(action_result['result'], indent=2)}{Colors.RESET}") - return - elif action_result.get('actions'): - print(f"{Colors.YELLOW}Objective not met yet. 
Suggested actions:{Colors.RESET}") - for action in action_result['actions']: - print(f"{Colors.MAGENTA}- {action}{Colors.RESET}") - actions = action_result['actions'] - # Visit the page again with the actions - html_content = visit_page_and_get_html(page_url, actions) - if not html_content: - print(f"{Colors.RED}Failed to retrieve content from {page_url} with actions{Colors.RESET}") - continue - # Process the new HTML - action_result = process_page(html_content, objective) - if action_result.get('result'): - print(f"{Colors.GREEN}Objective met after performing actions. Extracted information:{Colors.RESET}") - print(f"{Colors.MAGENTA}{json.dumps(action_result['result'], indent=2)}{Colors.RESET}") - return - else: - print(f"{Colors.RED}Objective still not met after performing actions on {page_url}{Colors.RESET}") - continue - else: - print(f"{Colors.RED}No actions suggested. Unable to proceed with {page_url}.{Colors.RESET}") - continue - - # If we reach here, the objective was not met on any of the pages - print(f"{Colors.RED}Objective not fulfilled after processing top 3 pages.{Colors.RESET}") - -if __name__ == "__main__": - main() diff --git a/examples/o1_web_crawler_actions/requirements.txt b/examples/o1_web_crawler_actions/requirements.txt deleted file mode 100644 index 249f8be..0000000 --- a/examples/o1_web_crawler_actions/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -firecrawl-py -python-dotenv -openai \ No newline at end of file From 5c4d436f1e8b5ed184777ce0b23bb1252b91d3fd Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 26 Sep 2024 14:46:48 -0400 Subject: [PATCH 3/4] Create o1_job_recommender.py --- .../o1_job_recommender/o1_job_recommender.py | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 examples/o1_job_recommender/o1_job_recommender.py diff --git a/examples/o1_job_recommender/o1_job_recommender.py b/examples/o1_job_recommender/o1_job_recommender.py new file mode 100644 index 0000000..9cc6305 --- /dev/null +++ b/examples/o1_job_recommender/o1_job_recommender.py @@ -0,0 +1,283 @@ +# %% +# %% +import os +import requests +import json +from dotenv import load_dotenv +from openai import OpenAI + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' +# Load environment variables +load_dotenv() + +# Initialize the FirecrawlApp with your API key +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +# Set the jobs page URL +jobs_page_url = "https://openai.com/careers/search" + +# Resume +resume_paste = """" +Eric Ciarla +Co-Founder @ Firecrawl +San Francisco, California, United States +Summary +Building… +Experience +Firecrawl +Co-Founder +April 2024 - Present (6 months) +San Francisco, California, United States +Firecrawl by Mendable. Building data extraction infrastructure for AI. 
Used by +Amazon, Zapier, and Nvidia (YC S22) +Mendable +2 years 7 months +Co-Founder @ Mendable.ai +March 2022 - Present (2 years 7 months) +San Francisco, California, United States +- Built an AI powered search platform that that served millions of queries for +hundreds of customers (YC S22) +- We were one of the first LLM powered apps adopted by industry leaders like +Coinbase, Snap, DoorDash, and MongoDB +Co-Founder @ SideGuide +March 2022 - Present (2 years 7 months) +San Francisco, California, United States +- Built and scaled an online course platform with a community of over 50,000 +developers +- Selected for Y Combinator S22 batch, 2% acceptance rate +Fracta +Data Engineer +2022 - 2022 (less than a year) +Palo Alto, California, United States +- Demoed tool during sales calls and provided technical support during the +entire customer lifecycle +Page 1 of 2 +- Mined, wrangled, & visualized geospatial and water utility data for predictive +analytics & ML workflows (Python, QGIS) +Ford Motor Company +Data Scientist +2021 - 2021 (less than a year) +Dearborn, Michigan, United States +- Extracted, cleaned, and joined data from multiple sources using SQL, +Hadoop, and Alteryx +- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the +relationships between survey free response verbatim topics (derived from +natural language processing models) and numerical customer experience +scores +MDRemindME +Co-Founder +2018 - 2020 (2 years) +Durham, New Hampshire, United States +- Founded and led a healthtech startup aimed at improving patient adherence +to treatment plans through an innovative engagement and retention tool +- Piloted the product with healthcare providers and patients, gathering critical +insights to refine functionality and enhance user experience +- Secured funding through National Science Foundation I-CORPS Grant and +UNH Entrepreneurship Center Seed Grant +Education +Y Combinator +S22 +University of New Hampshire +Economics and Philosophy +""" + +# First, scrape the jobs page using Firecrawl +try: + response = requests.post( + "https://api.firecrawl.dev/v1/scrape", + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {firecrawl_api_key}" + }, + json={ + "url": jobs_page_url, + "formats": ["markdown"] + } + ) + + if response.status_code == 200: + result = response.json() + if result.get('success'): + html_content = result['data']['markdown'] + # Define the O1 prompt for extracting apply links + prompt = f""" + Extract up to 30 job application links from the given markdown content. + Return the result as a JSON object with a single key 'apply_links' containing an array of strings (the links). + The output should be a valid JSON object, with no additional text. + Do not include any JSON markdown formatting or code block indicators. + Provide only the raw JSON object as the response. 
+ + Example of the expected format: + {{"apply_links": ["https://example.com/job1", "https://example.com/job2", ...]}} + + Markdown content: + {html_content[:100000]} + """ + print(f"{Colors.GREEN}Successfully scraped the jobs page{Colors.RESET}") + else: + print(f"{Colors.RED}Failed to scrape the jobs page: {result.get('message', 'Unknown error')}{Colors.RESET}") + html_content = "" + else: + print(f"{Colors.RED}Error {response.status_code}: {response.text}{Colors.RESET}") + html_content = "" +except requests.RequestException as e: + print(f"{Colors.RED}An error occurred while scraping: {str(e)}{Colors.RESET}") + html_content = "" +except json.JSONDecodeError as e: + print(f"{Colors.RED}Error decoding JSON response: {str(e)}{Colors.RESET}") + html_content = "" +except Exception as e: + print(f"{Colors.RED}An unexpected error occurred while scraping: {str(e)}{Colors.RESET}") + html_content = "" + +# Extract apply links from the scraped HTML using O1 +apply_links = [] +if html_content: + try: + completion = client.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": prompt + } + ] + ) + + if completion.choices: + print(completion.choices[0].message.content) + result = json.loads(completion.choices[0].message.content.strip()) + + apply_links = result['apply_links'] + print(f"{Colors.GREEN}Successfully extracted {len(apply_links)} apply links{Colors.RESET}") + else: + print(f"{Colors.RED}No apply links extracted{Colors.RESET}") + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}") + except KeyError as e: + print(f"{Colors.RED}Expected key not found in OpenAI response: {str(e)}{Colors.RESET}") + except Exception as e: + print(f"{Colors.RED}An unexpected error occurred during extraction: {str(e)}{Colors.RESET}") +else: + print(f"{Colors.RED}No HTML content to process{Colors.RESET}") + +# Initialize a list to store the extracted data +extracted_data = [] + + +# %% +print(f"{Colors.CYAN}Apply links:{Colors.RESET}") +for link in apply_links: + print(f"{Colors.YELLOW}{link}{Colors.RESET}") + +# %% +# Process each apply link +for index, link in enumerate(apply_links): + try: + response = requests.post( + "https://api.firecrawl.dev/v1/scrape", + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {firecrawl_api_key}" + }, + json={ + "url": link, + "formats": ["extract"], + "actions": [{ + "type": "click", + "selector": "#job-overview" + }], + "extract": { + "schema": { + "type": "object", + "properties": { + "job_title": {"type": "string"}, + "sub_division_of_organization": {"type": "string"}, + "key_skills": {"type": "array", "items": {"type": "string"}}, + "compensation": {"type": "string"}, + "location": {"type": "string"}, + "apply_link": {"type": "string"} + }, + "required": ["job_title", "sub_division_of_organization", "key_skills", "compensation", "location", "apply_link"] + } + } + } + ) + + if response.status_code == 200: + result = response.json() + if result.get('success'): + extracted_data.append(result['data']['extract']) + print(f"{Colors.GREEN}Data extracted for job {index}{Colors.RESET}") + else: + print(f"") + else: + print(f"") + except Exception as e: + print(f"") + + +# %% +# %% +# Print the extracted data +print(f"{Colors.CYAN}Extracted data:{Colors.RESET}") +for job in extracted_data: + print(json.dumps(job, indent=2)) + print(f"{Colors.MAGENTA}{'-' * 50}{Colors.RESET}") + + +# %% + + + + +# Use o1-preview to choose which jobs should be applied to 
based on the resume +prompt = f""" +Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text: + +[ + {{ + "job_title": "Job Title", + "compensation": "Compensation (if available, otherwise empty string)", + "apply_link": "Application URL" + }}, + ... +] + +Based on the following resume: +{resume_paste} + +And the following job listings: +{json.dumps(extracted_data, indent=2)} +""" + +completion = client.chat.completions.create( + model="o1-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt + } + ] + } + ] +) + +recommended_jobs = json.loads(completion.choices[0].message.content.strip()) + +print(f"{Colors.CYAN}Recommended jobs:{Colors.RESET}") +print(json.dumps(recommended_jobs, indent=2)) + + From 20b998e66aba7cb1d2cecb7e386c6cab20575b55 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 26 Sep 2024 14:51:07 -0400 Subject: [PATCH 4/4] Delete o1_job_recommender.ipynb --- .../o1_job_recommender.ipynb | 672 ------------------ 1 file changed, 672 deletions(-) delete mode 100644 examples/o1_job_recommender/o1_job_recommender.ipynb diff --git a/examples/o1_job_recommender/o1_job_recommender.ipynb b/examples/o1_job_recommender/o1_job_recommender.ipynb deleted file mode 100644 index 8827817..0000000 --- a/examples/o1_job_recommender/o1_job_recommender.ipynb +++ /dev/null @@ -1,672 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# %%\n", - "import os\n", - "import datetime\n", - "import time\n", - "import requests\n", - "import json\n", - "from dotenv import load_dotenv\n", - "from firecrawl import FirecrawlApp\n", - "from pydantic import BaseModel, Field\n", - "from typing import List\n", - "\n", - "# Load environment variables\n", - "load_dotenv()\n", - "\n", - "# Retrieve API keys from environment variables\n", - "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", - "\n", - "# Initialize the FirecrawlApp with your API key\n", - "app = FirecrawlApp(api_key=firecrawl_api_key)\n", - "\n", - "# Set the jobs page URL\n", - "jobs_page_url = \"https://openai.com/careers\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total pages mapped (excluding original URL): 14\n", - "['https://openai.com/careers/research-scientist', 'https://openai.com/careers/analytics-engineer', 'https://openai.com/careers/solutions-architect', 'https://openai.com/careers/iam-engineer', 'https://openai.com/careers/talent-partnerships', 'https://openai.com/careers/product-designer', 'https://openai.com/careers/recruiting-coordinator', 'https://openai.com/careers/av-specialist', 'https://openai.com/careers/it-support', 'https://openai.com/careers/director-edu', 'https://openai.com/careers/research-engineer', 'https://openai.com/careers/solutions-engineer', 'https://openai.com/careers/software-engineer-networking', 'https://openai.com/careers/revenue-operations-leader']\n" - ] - } - ], - "source": [ - "# %%\n", - "# Use the Firecrawl Map API to get the sitemap\n", - "api_url = \"https://api.firecrawl.dev/v1/map\"\n", - "payload = {\n", - " \"url\": jobs_page_url,\n", - " \"search\": \"\", # Empty search term to get all 
pages\n", - " \"limit\": 15\n", - "}\n", - "headers = {\n", - " \"Authorization\": f\"Bearer {firecrawl_api_key}\",\n", - " \"Content-Type\": \"application/json\"\n", - "}\n", - "response = requests.post(api_url, json=payload, headers=headers)\n", - "\n", - "if response.status_code == 200:\n", - " map_result = response.json()\n", - " if map_result.get('success'):\n", - " links = [link for link in map_result.get('links', []) if link != jobs_page_url]\n", - " print(f\"Total pages mapped (excluding original URL): {len(links)}\")\n", - " print(links)\n", - " else:\n", - " print(\"Map API request was not successful\")\n", - " exit(1)\n", - "else:\n", - " print(f\"Error: {response.status_code}\")\n", - " print(response.text)\n", - " exit(1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error 500 for page 0: {\"success\":false,\"error\":\"(Internal server error) - JSON parsing error(s): must be object\\n\\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. - Could be due to LLM parsing issues\"}\n", - "Data extracted for page 1\n", - "Data extracted for page 2\n", - "Data extracted for page 3\n", - "Data extracted for page 4\n", - "Data extracted for page 5\n", - "Data extracted for page 6\n", - "Data extracted for page 7\n", - "Data extracted for page 8\n", - "Data extracted for page 9\n", - "Data extracted for page 10\n", - "Data extracted for page 11\n", - "Data extracted for page 12\n", - "Data extracted for page 13\n" - ] - } - ], - "source": [ - "# %%\n", - "# Define the extraction schema\n", - "extract_schema = {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"job_title\": {\n", - " \"type\": \"string\"\n", - " },\n", - " \"sub_division_of_organization\": {\n", - " \"type\": \"string\"\n", - " },\n", - " \"key_skills\": {\n", - " \"type\": \"array\",\n", - " \"items\": {\n", - " \"type\": \"string\"\n", - " }\n", - " },\n", - " \"compensation\": {\n", - " \"type\": \"string\"\n", - " },\n", - " \"apply_link\": {\n", - " \"type\": \"string\"\n", - " }\n", - " },\n", - " \"required\": [\"job_title\", \"sub_division_of_organization\", \"key_skills\", \"compensation\", \"apply_link\"]\n", - "}\n", - "\n", - "# Initialize a list to store the extracted data\n", - "extracted_data = []\n", - "\n", - "# Process each link in the map result\n", - "for index, link in enumerate(links):\n", - " try:\n", - " response = requests.post(\n", - " \"https://api.firecrawl.dev/v1/scrape\",\n", - " headers={\n", - " \"Content-Type\": \"application/json\",\n", - " \"Authorization\": f\"Bearer {firecrawl_api_key}\"\n", - " },\n", - " json={\n", - " \"url\": link,\n", - " \"formats\": [\"extract\"],\n", - " \"extract\": {\n", - " \"schema\": extract_schema\n", - " }\n", - " }\n", - " )\n", - " \n", - " if response.status_code == 200:\n", - " result = response.json()\n", - " if result.get('success'):\n", - " extracted_data.append(result['data']['extract'])\n", - " print(f\"Data extracted for page {index}\")\n", - " else:\n", - " print(f\"No data extracted for page {index}\")\n", - " else:\n", - " print(f\"Error {response.status_code} for page {index}: {response.text}\")\n", - " except Exception as e:\n", - " print(f\"An error occurred for page {index}: {str(e)}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted data:\n", - "{\n", - " \"job_title\": \"Analytics Engineer\",\n", - " \"sub_division_of_organization\": \"Growth\",\n", - " \"key_skills\": [\n", - " \"SQL\",\n", - " \"Python\",\n", - " \"business intelligence tools\",\n", - " \"ETL workflows\",\n", - " \"data analysis\",\n", - " \"dashboards\",\n", - " \"data storytelling\"\n", - " ],\n", - " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Solutions Architect\",\n", - " \"sub_division_of_organization\": \"Technical Success\",\n", - " \"key_skills\": [\n", - " \"technical consulting\",\n", - " \"Generative AI\",\n", - " \"ML solutions\",\n", - " \"network architecture\",\n", - " \"cloud architecture\",\n", - " \"Python\",\n", - " \"Javascript\"\n", - " ],\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"IAM Engineer\",\n", - " \"sub_division_of_organization\": \"IT\",\n", - " \"key_skills\": [\n", - " \"AzureAD\",\n", - " \"Python\",\n", - " \"PowerShell\",\n", - " \"identity governance\",\n", - " \"automation\",\n", - " \"Terraform\"\n", - " ],\n", - " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/e798aa62-74f9-4f53-a890-716310926b70/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Talent Partnerships\",\n", - " \"sub_division_of_organization\": \"Communications\",\n", - " \"key_skills\": [\n", - " \"relationship management\",\n", - " \"communication\",\n", - " \"adaptability\",\n", - " \"creativity\",\n", - " \"collaboration\",\n", - " \"transparency\"\n", - " ],\n", - " \"compensation\": \"$171K \\u2013 $240K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/84a4a8bb-7d5a-4989-9b5c-bd841db2698e/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"404 Error Page\",\n", - " \"sub_division_of_organization\": \"Web Development\",\n", - " \"key_skills\": [\n", - " \"Error Handling\",\n", - " \"Web Design\",\n", - " \"User Experience\"\n", - " ],\n", - " \"compensation\": \"N/A\",\n", - " \"apply_link\": \"N/A\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"\",\n", - " \"sub_division_of_organization\": \"\",\n", - " \"key_skills\": [],\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"AV Specialist\",\n", - " \"sub_division_of_organization\": \"IT\",\n", - " \"key_skills\": [\n", - " \"AV support\",\n", - " \"Google Meet\",\n", - " \"Zoom\",\n", - " \"Cisco\",\n", - " \"ticket management\",\n", - " \"IT troubleshooting\",\n", - " \"problem-solving\",\n", - " \"interpersonal skills\"\n", - " ],\n", - " \"compensation\": \"$110K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/20fd0ff8-dd5e-4bec-a401-dd3f8263fe24/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " 
\"job_title\": \"IT Support\",\n", - " \"sub_division_of_organization\": \"IT\",\n", - " \"key_skills\": [\n", - " \"Intermediate-to-expert understanding of IDP and MDM solutions\",\n", - " \"Familiarity with Windows or Linux\",\n", - " \"Understanding of Python, Bash, or Apple Script\",\n", - " \"Experience with collaboration software\",\n", - " \"Hands-on expertise implementing and managing AV and telecom systems\",\n", - " \"Complete Mac and macOS troubleshooting skills\",\n", - " \"Adept in orchestrating high-production events\"\n", - " ],\n", - " \"compensation\": \"$110K \\u2013 $140K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/ca263679-08d5-4492-9a56-32fbcb7318a5/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"404\",\n", - " \"sub_division_of_organization\": \"OpenAI\",\n", - " \"key_skills\": [],\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Research Engineer\",\n", - " \"sub_division_of_organization\": \"Research\",\n", - " \"key_skills\": [\n", - " \"strong programming skills\",\n", - " \"experience working in large distributed systems\"\n", - " ],\n", - " \"compensation\": \"$295K \\u2013 $440K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/240d459b-696d-43eb-8497-fab3e56ecd9b/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Solutions Engineer\",\n", - " \"sub_division_of_organization\": \"Technical Success\",\n", - " \"key_skills\": [\n", - " \"7+ years of experience in a technical pre-sales role\",\n", - " \"Understanding of IT security principles\",\n", - " \"Experience with programming languages like Python or Javascript\",\n", - " \"Knowledge of network/cloud architecture\",\n", - " \"Effective presentation and communication skills\",\n", - " \"Ability to manage C-level technical and business relationships\"\n", - " ],\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/dbfef1b0-9a77-46bd-ad36-67f3d0286924/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Software Engineer, Networking\",\n", - " \"sub_division_of_organization\": \"Platform\",\n", - " \"key_skills\": [\n", - " \"C++\",\n", - " \"CUDA\",\n", - " \"distributed algorithms\",\n", - " \"RDMA\",\n", - " \"network simulation techniques\"\n", - " ],\n", - " \"compensation\": \"$360K \\u2013 $530K\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340c0c22-8d8f-4232-b17e-f642b64c25c3/application\"\n", - "}\n", - "--------------------------------------------------\n", - "{\n", - " \"job_title\": \"Revenue Operations Leader\",\n", - " \"sub_division_of_organization\": \"Revenue Operations\",\n", - " \"key_skills\": [\n", - " \"Extensive experience in revenue operations or strategy at a high-growth, technology company\",\n", - " \"Proficiency with GTM systems, namely SFDC, Gong\",\n", - " \"Experience managing a large team of 15+ operational team members\",\n", - " \"Highly analytical\",\n", - " \"Exceptional project management skills with experience leading complex, cross-functional initiatives\",\n", - " \"Deep experience designing & executing on a territory strategy for 100+ GTM orgs\",\n", - " \"Strong communication skills and executive presence\",\n", - " \"An understanding of the AI landscape, our 
applications, and the problems they solve for our customers\",\n", - " \"The ability to thrive in ambiguity and work autonomously\"\n", - " ],\n", - " \"compensation\": \"$325K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/61a484e5-4723-4031-92c1-068dfe4b069f/application\"\n", - "}\n", - "--------------------------------------------------\n", - "Extracted data saved to /Users/ericciarla/Documents/GitHub/firecrawl/examples/getting_latest_openai_jobs/openai_jobs.csv\n" - ] - } - ], - "source": [ - "# %%\n", - "# Print the extracted data\n", - "print(\"Extracted data:\")\n", - "for job in extracted_data:\n", - " print(json.dumps(job, indent=2))\n", - " print(\"-\" * 50) # Separator between jobs\n", - "\n", - "# Save as CSV\n", - "import csv\n", - "import os\n", - "\n", - "# Get the current directory\n", - "current_dir = os.getcwd()\n", - "\n", - "# Create the full path for the CSV file\n", - "csv_file = os.path.join(current_dir, \"openai_jobs.csv\")\n", - "\n", - "try:\n", - " with open(csv_file, \"w\", newline=\"\") as f:\n", - " if extracted_data:\n", - " writer = csv.DictWriter(f, fieldnames=extracted_data[0].keys())\n", - " writer.writeheader()\n", - " for job in extracted_data:\n", - " writer.writerow(job)\n", - " print(f\"Extracted data saved to {csv_file}\")\n", - " else:\n", - " print(\"No data to save.\")\n", - "except IOError as e:\n", - " print(f\"Error saving CSV file: {e}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recommended jobs:\n", - "[\n", - " {\n", - " \"job_title\": \"Analytics Engineer\",\n", - " \"compensation\": \"$245K \\u2013 $385K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application\"\n", - " },\n", - " {\n", - " \"job_title\": \"Solutions Architect\",\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application\"\n", - " },\n", - " {\n", - " \"job_title\": \"Research Engineer\",\n", - " \"compensation\": \"$295K \\u2013 $440K + Offers Equity\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/240d459b-696d-43eb-8497-fab3e56ecd9b/application\"\n", - " },\n", - " {\n", - " \"job_title\": \"Solutions Engineer\",\n", - " \"compensation\": \"\",\n", - " \"apply_link\": \"https://jobs.ashbyhq.com/openai/dbfef1b0-9a77-46bd-ad36-67f3d0286924/application\"\n", - " }\n", - "]\n" - ] - } - ], - "source": [ - "from openai import OpenAI\n", - "\n", - "# Resume\n", - "resume_paste = \"\"\"\"\n", - "Eric Ciarla\n", - "Co-Founder @ Firecrawl\n", - "San Francisco, California, United States\n", - "Summary\n", - "Building…\n", - "Experience\n", - "Firecrawl\n", - "Co-Founder\n", - "April 2024 - Present (6 months)\n", - "San Francisco, California, United States\n", - "Firecrawl by Mendable. Building data extraction infrastructure for AI. 
Used by\n", - "Amazon, Zapier, and Nvidia (YC S22)\n", - "Mendable\n", - "2 years 7 months\n", - "Co-Founder @ Mendable.ai\n", - "March 2022 - Present (2 years 7 months)\n", - "San Francisco, California, United States\n", - "- Built an AI powered search platform that that served millions of queries for\n", - "hundreds of customers (YC S22)\n", - "- We were one of the first LLM powered apps adopted by industry leaders like\n", - "Coinbase, Snap, DoorDash, and MongoDB\n", - "Co-Founder @ SideGuide\n", - "March 2022 - Present (2 years 7 months)\n", - "San Francisco, California, United States\n", - "- Built and scaled an online course platform with a community of over 50,000\n", - "developers\n", - "- Selected for Y Combinator S22 batch, 2% acceptance rate\n", - "Fracta\n", - "Data Engineer\n", - "2022 - 2022 (less than a year)\n", - "Palo Alto, California, United States\n", - "- Demoed tool during sales calls and provided technical support during the\n", - "entire customer lifecycle\n", - "Page 1 of 2\n", - "- Mined, wrangled, & visualized geospatial and water utility data for predictive\n", - "analytics & ML workflows (Python, QGIS)\n", - "Ford Motor Company\n", - "Data Scientist\n", - "2021 - 2021 (less than a year)\n", - "Dearborn, Michigan, United States\n", - "- Extracted, cleaned, and joined data from multiple sources using SQL,\n", - "Hadoop, and Alteryx\n", - "- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the\n", - "relationships between survey free response verbatim topics (derived from\n", - "natural language processing models) and numerical customer experience\n", - "scores\n", - "MDRemindME\n", - "Co-Founder\n", - "2018 - 2020 (2 years)\n", - "Durham, New Hampshire, United States\n", - "- Founded and led a healthtech startup aimed at improving patient adherence\n", - "to treatment plans through an innovative engagement and retention tool\n", - "- Piloted the product with healthcare providers and patients, gathering critical\n", - "insights to refine functionality and enhance user experience\n", - "- Secured funding through National Science Foundation I-CORPS Grant and\n", - "UNH Entrepreneurship Center Seed Grant\n", - "Education\n", - "Y Combinator\n", - "S22\n", - "University of New Hampshire\n", - "Economics and Philosophy\n", - "\"\"\"\n", - "\n", - "# Use o1-preview to choose which jobs should be applied to based on the resume\n", - "client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n", - "\n", - "prompt = f\"\"\"\n", - "Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. 
The output should be a valid JSON array of objects in the following format, with no additional text:\n", - "\n", - "[\n", - " {{\n", - " \"job_title\": \"Job Title\",\n", - " \"compensation\": \"Compensation (if available, otherwise empty string)\",\n", - " \"apply_link\": \"Application URL\"\n", - " }},\n", - " ...\n", - "]\n", - "\n", - "Based on the following resume:\n", - "{resume_paste}\n", - "\n", - "And the following job listings:\n", - "{json.dumps(extracted_data, indent=2)}\n", - "\"\"\"\n", - "\n", - "completion = client.chat.completions.create(\n", - " model=\"o1-preview\",\n", - " messages=[\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": prompt\n", - " }\n", - " ]\n", - " }\n", - " ]\n", - ")\n", - "\n", - "recommended_jobs = json.loads(completion.choices[0].message.content.strip())\n", - "\n", - "print(\"Recommended jobs:\")\n", - "print(json.dumps(recommended_jobs, indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# scrape each of the apply links with firecrawl /v1/scrape\n", - "import requests\n", - "\n", - "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", - "\n", - "def scrape_apply_link(url):\n", - " api_url = \"https://api.firecrawl.dev/v1/scrape\"\n", - " headers = {\n", - " \"Authorization\": f\"Bearer {firecrawl_api_key}\",\n", - " \"Content-Type\": \"application/json\"\n", - " }\n", - " payload = {\n", - " \"url\": url\n", - " }\n", - " \n", - " response = requests.post(api_url, json=payload, headers=headers)\n", - " if response.status_code == 200:\n", - " return response.json()\n", - " else:\n", - " print(f\"Error scraping {url}: {response.status_code}\")\n", - " return None\n", - "\n", - "scraped_job_data = []\n", - "for job in recommended_jobs:\n", - " apply_link = job.get('apply_link')\n", - " if apply_link:\n", - " scraped_data = scrape_apply_link(apply_link)\n", - " if scraped_data:\n", - " scraped_job_data.append({\n", - " 'job_title': job['job_title'],\n", - " 'compensation': job['compensation'],\n", - " 'apply_link': apply_link,\n", - " 'scraped_content': scraped_data\n", - " })\n", - "\n", - "print(f\"Scraped {len(scraped_job_data)} job application pages\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use o1 to write the application for you and return in json\n", - "import json\n", - "\n", - "\n", - "def generate_application(job_data, resume_paste):\n", - " # Extract relevant information from scraped content\n", - " scraped_text = job_data['scraped_content'].get('text', '')\n", - " \n", - " prompt = f\"\"\"\n", - " Based on the following job information, scraped content from the application page, and the provided resume, write a tailored job application:\n", - "\n", - " Job Title: {job_data['job_title']}\n", - " Compensation: {job_data['compensation']}\n", - " Scraped Content: {scraped_text[:1000]} # Limit to first 1000 characters to avoid token limits\n", - "\n", - " Resume:\n", - " {resume_paste}\n", - "\n", - " Please format the application as a JSON object with the following fields:\n", - " - cover_letter: A personalized cover letter addressing key points from the scraped content and highlighting relevant experience from the resume\n", - " - resume_highlights: Key points from the resume that align with the job requirements mentioned in the scraped content\n", - " - questions: Any questions you have about the position, derived from 
the available information\n", - "\n", - " Ensure the content is specifically tailored to the information provided in the scraped content and leverages the experience detailed in the resume.\n", - " \"\"\"\n", - "\n", - " try:\n", - " completion = client.chat.completions.create(\n", - " model=\"o1-preview\",\n", - " messages=[\n", - " \n", - " {\"role\": \"user\", \"content\": prompt}\n", - " ]\n", - " )\n", - " return json.loads(completion.choices[0].message.content)\n", - " except Exception as e:\n", - " print(f\"Error generating application: {str(e)}\")\n", - " return None\n", - "\n", - "\n", - "\n", - "applications = []\n", - "for job in scraped_job_data:\n", - " application = generate_application(job, resume_paste)\n", - " if application:\n", - " applications.append({\n", - " \"job_title\": job[\"job_title\"],\n", - " \"apply_link\": job[\"apply_link\"],\n", - " \"application\": application\n", - " })\n", - "\n", - "print(f\"Generated {len(applications)} job applications based on scraped content and resume\")\n", - "print(json.dumps(applications, indent=2))\n", - "\n", - "# Save the JSON to a file\n", - "output_file = \"generated_applications.json\"\n", - "with open(output_file, \"w\") as f:\n", - " json.dump(applications, f, indent=2)\n", - "\n", - "print(f\"Saved generated applications to {output_file}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}