Mirror of https://github.com/stackblitz/bolt.new, synced 2025-06-26 18:17:50 +00:00
adding cv to repo
commit a1683951a0 (parent 81ba76ff2f)
clockwork_viking/api_latest.py (new file, 1640 lines added): file diff suppressed because it is too large
clockwork_viking/readme.txt (new file, 9 lines added)
@@ -0,0 +1,9 @@
Example Prompt for V0:

Can you help me build a simple UI to interface with an API I have? The app should do the following:

Give the user a way to send in files, and send those files to an API endpoint as a POST request to "http://localhost:8000/receive_files/". The input data should be called "filenames" and should be a List[UploadFile] as used by FastAPI. The other input should be called "object_names" and should be the string "raw_files".

Run the following API functions in the same location once the files are input: extract_text_from_files(object_name='raw_files', new_object_name='extracted_text'), extract_details_from_text(object_name='extracted_text', new_object_name='contact_info', extract_elements=['name', 'organization', 'phone number', 'email'])

Every 15 seconds, try to POST object_name='contact_info' to the endpoint http://localhost:8000/return_data/. Once the endpoint returns a 200 result, show the value of the returned object to the user.
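The prompt above maps onto a small client-side workflow. As a rough sketch only (the helper names are illustrative, and the exact request shapes are assumptions, since the api_latest.py diff is suppressed above), a plain-Python client for the same flow could look like this, using the requests library:

    # client_sketch.py - illustrative only; field names and endpoints follow the prompt,
    # but the transport details (form fields vs. query params) are assumptions.
    import time
    import requests

    API = "http://localhost:8000"

    def upload_files(paths):
        # Repeat the "filenames" part once per file, matching FastAPI's List[UploadFile].
        files = [("filenames", (path, open(path, "rb"))) for path in paths]
        data = {"object_names": "raw_files"}
        return requests.post(f"{API}/receive_files/", files=files, data=data)

    def poll_contact_info(interval=15):
        # POST every `interval` seconds until /return_data/ answers with HTTP 200,
        # then hand the returned object back to the caller for display.
        while True:
            resp = requests.post(f"{API}/return_data/", data={"object_name": "contact_info"})
            if resp.status_code == 200:
                return resp.json()
            time.sleep(interval)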
clockwork_viking/run_api_latest.sh (new executable file, 2 lines added)
@@ -0,0 +1,2 @@
# uvicorn api:app --reload
python -m uvicorn api_latest:app --reload --host 0.0.0.0 --port 5000
clockwork_viking/setup_mongodb.sh (new executable file, 41 lines added)
@@ -0,0 +1,41 @@
#!/bin/bash

# Set your database and collection names
DATABASE_NAME="pipeline_db"
COLLECTION_NAME="data_objects"

echo "Updating package list..."
sudo apt-get update

echo "Installing gnupg and curl if not present..."
sudo apt-get install -y gnupg curl

echo "Importing MongoDB public GPG Key..."
curl -fsSL https://pgp.mongodb.com/server-7.0.asc | \
    sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg \
    --dearmor

echo "Creating MongoDB source list..."
echo "deb [ signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | \
    sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list

echo "Updating packages again..."
sudo apt-get update

echo "Installing MongoDB..."
sudo apt-get install -y mongodb-org

echo "Starting MongoDB..."
sudo systemctl start mongod
sudo systemctl enable mongod

echo "Waiting for MongoDB to start..."
sleep 5

echo "Creating database '$DATABASE_NAME' and collection '$COLLECTION_NAME'..."
mongosh <<EOF
use $DATABASE_NAME
db.createCollection("$COLLECTION_NAME")
EOF

echo "MongoDB is running at mongodb://localhost:27017"
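Once the script has run, the API can read and write pipeline objects in the data_objects collection. A minimal sketch for inspecting it with pymongo (the document shape is a guess, since the api_latest.py diff is suppressed above):

    # mongo_check.py - illustrative only; assumes pymongo is installed and that the
    # API stores one document per named pipeline object (the schema is not confirmed).
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    collection = client["pipeline_db"]["data_objects"]

    # Print whatever the pipeline has written so far, minus MongoDB's internal _id.
    for doc in collection.find({}, {"_id": 0}):
        print(doc)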
clockwork_viking/source_me_first.sh (new executable file, 4 lines added)
@@ -0,0 +1,4 @@
#!/bin/bash
virtualenv -p python3.12 venv
source venv/bin/activate
pip install -r requirements.txt
clockwork_viking/url_scraper.py (new file, 240 lines added)
@@ -0,0 +1,240 @@
from __future__ import print_function

import requests
import pandas as pd
import json
import re
import os
import csv
# import argparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import NavigableString
from bs4 import BeautifulSoup
from bs4 import Tag
from urllib.parse import urljoin

# Set up command-line argument parsing
# parser = argparse.ArgumentParser(description='Search for keywords in a file with context.')
# parser.add_argument('input_file', type=str, help='Path to the input JSON file')

# Parse the arguments
# args = parser.parse_args()

class MyBeautifulSoup(BeautifulSoup):
    def _all_strings_plus(self, strip=True, types=NavigableString,
                          aRef={'a': lambda a: f"<{a.get('href', '')}>"},
                          skipTags=['script', 'style']):
        # verify types
        if hasattr(types, '__iter__') and not isinstance(types, type):
            types = tuple([t for t in types if isinstance(t, type)])
        if not (types and isinstance(types, (type, tuple))): types = NavigableString

        # skip text in tags included in aRef
        # skipTags += list(aRef.keys())

        for descendant in self.descendants:
            # yield extra strings according to aRef
            if isinstance(descendant, Tag) and descendant.name in aRef:
                extraStr = aRef[descendant.name](descendant)
                if isinstance(extraStr, str): yield extraStr

            # skip text nodes DIRECTLY inside a Tag in aRef
            # if descendant.parent.name in aRef: continue

            # skip ALL text nodes inside skipTags
            if skipTags and descendant.find_parent(skipTags): continue

            # default behavior
            if not isinstance(descendant, types): continue

            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0: continue
            yield descendant

    def get_text_plus(self, separator=" ", srcUrl=None, **aspArgs):
        if srcUrl and isinstance(srcUrl, str):
            def hrefStr(aTag):
                href = aTag.get('href')
                if not (href is None or href.startswith('javascript:')):
                    return f"<{urljoin(srcUrl, href)}>"
            aspArgs.setdefault('aRef', {})
            aspArgs['aRef']['a'] = hrefStr

        return separator.join(self._all_strings_plus(**aspArgs))
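# Illustrative usage of get_text_plus() (not part of the original module): with a
# srcUrl supplied, each <a> href is expanded to an absolute URL and emitted inline,
# just before the anchor's own text.
#
#   soup = MyBeautifulSoup('<p>See <a href="/docs">the docs</a>.</p>', "html.parser")
#   soup.get_text_plus(srcUrl="https://example.com")
#   # -> 'See <https://example.com/docs> the docs .'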

class URLScraper:

    @staticmethod
    def open_page(url):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode to capture full page
        # chrome_options.add_argument("window-size=6709,5337")
        # chrome_options.add_argument("window-size=1920,5337")
        # Initialize the Chrome WebDriver with the specified options
        driver = webdriver.Chrome(options=chrome_options)
        # Open the URL in the browser
        driver.get(url)
        # time.sleep(3)  # Wait for the page to load (you can adjust the waiting time if needed)
        # driver.get(url)
        soup = MyBeautifulSoup(driver.page_source, "html.parser")
        print('TYPE OF SOUP:')
        print(type(soup))
        return soup
    @staticmethod
    def get_visible_text_and_links(url, use_selenium=False):
        if use_selenium:
            print('OPENING PAGE:')
            soup = URLScraper.open_page(url)
            print(type(soup))

        # Regardless, make a GET request to the URL to get the base URL:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        # Expand relative links to absolute URLs
        base_url = response.url

        # Check if the request was successful (status code 200)
        if use_selenium or response.status_code == 200:
            # Parse the HTML content of the page
            if not use_selenium:
                soup = MyBeautifulSoup(response.text.encode('ascii', 'ignore').decode('ascii'), 'html.parser')

                # Extract visible text
                visible_text = soup.body.get_text(' ', strip=True)

            else:
                visible_text = soup.get_text_plus(srcUrl=base_url)  # ' '.join(soup.stripped_strings)
            # print(visible_text)
            return visible_text  # returning here skips the link-replacement code below

            links = {link.get('href'): {'url': urljoin(base_url, link.get('href')), 'index': visible_text.find(link.text.strip()) + len(link.text.lstrip()), 'link_text': link.text.strip()} for link in soup.find_all('a') if visible_text.find(link.text.strip()) > 0}

            link_replacements = []
            for link, info in links.items():
                # print(f"Link: {link}, URL: {info['url']}, Index: {info['index']}")
                if link is not None and info['index'] >= 0:
                    link_replacements.append([info['index'], link])

            link_replacements = sorted(link_replacements, key=lambda x: x[0], reverse=True)
            for linkrep in link_replacements:
                # NOTE: Turning off link replacement for this task
                visible_text = visible_text[:linkrep[0]] + ' (' + linkrep[1] + ') ' + visible_text[linkrep[0]:]
            # Output the text with expanded links
            output = f"\n{visible_text}"
            return output

        else:
            return f'Failed to retrieve the page. Status code: {response.status_code}'
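    # Illustrative usage (not part of the original module): fetch a page and return its
    # visible text. With use_selenium=True the page is rendered in headless Chrome first
    # and links are embedded inline via get_text_plus(); with the default, plain
    # requests + BeautifulSoup text extraction is used.
    #
    #   text = URLScraper.get_visible_text_and_links("https://example.com")
    #   print(text[:200])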
    @staticmethod
    def extract_with_spacing(in_str: str, find_str: str, spacing: int) -> list:
        results = []
        find_len = len(find_str)

        idx = 0
        while idx < len(in_str):
            idx = in_str.find(find_str, idx)
            if idx == -1:
                break

            # Calculate start and end indices with spacing
            start = max(0, idx - spacing)
            end = min(len(in_str), idx + find_len + spacing)

            results.append(in_str[start:end])
            idx += 1  # move forward to find overlapping matches if any

        return results
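    # Illustrative usage (not part of the original module): return every occurrence of
    # find_str with `spacing` characters of context on each side.
    #
    #   URLScraper.extract_with_spacing("the quick brown fox", "quick", 4)
    #   # -> ['the quick bro']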
    @staticmethod
    def create_keyword_results_csv(filename='keyword_results.csv'):
        # Open the file in write mode initially
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            csv_writer = csv.writer(csvfile)

            # Write the header row
            csv_writer.writerow(['Scraped_URL', 'Company_Name', 'Website', 'Scraped_URLs', 'Page_Text'])

        return filename

    @staticmethod
    def create_error_results_csv(filename='error_results.csv'):
        # Open the file in write mode initially
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            csv_writer = csv.writer(csvfile)

            # Write the header row
            csv_writer.writerow(['Scraped_URL', 'Company_Name', 'Website', 'Scraped_URLs', 'Page_Text'])

        return filename

    @staticmethod
    def append_keyword_result(filename, url_to_scrape, keyword, example):
        # Open the file in append mode
        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            csv_writer = csv.writer(csvfile)

            # Write the data row
            csv_writer.writerow([url_to_scrape, keyword, example])

    @staticmethod
    def append_error_result(filename, url_to_scrape, page_text, company_name, base_url, pip_urls):
        # Open the file in append mode
        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            csv_writer = csv.writer(csvfile)

            # Write the data row
            csv_writer.writerow([url_to_scrape, company_name, base_url, pip_urls, page_text])

    @staticmethod
    def append_good_result(filename, url_to_scrape, page_text, company_name, base_url, pip_urls):
        # Open the file in append mode
        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            csv_writer = csv.writer(csvfile)

            # Write the data row
            csv_writer.writerow([url_to_scrape, company_name, base_url, pip_urls, page_text])

# input_data = pd.read_json(args.input_file).to_dict(orient='records')
# # print(f"Input Data: {input_data}")
# # print(f"{type(input_data)}")

# results_file = create_keyword_results_csv('./scraped_data/' + 'scraped_results.csv')
# errors_file = create_error_results_csv('./scraped_data/' + 'scraped_errors.csv')

# for company in input_data:
#     urls = list(company.get("People-info page URL"))
#     company_name = str(company.get("Company Name"))
#     base_url = str(company.get("Base URL"))
#     pip_urls = str(company.get("People-info page URL"))
#     print(f"Working on company: {company_name}")
#     # urls = list(pd.read_json(args.input_file)['Address'])

#     for url_to_scrape in urls:
#         try:
#             web_page_text = '\n\n' + get_visible_text_and_links(url_to_scrape, use_selenium=False)
#             if (web_page_text is None) or (len(web_page_text) < 50):
#                 print(f'\n\nSomething messed up! Possible too-short page. The web_page_text was: {web_page_text}\n\n')
#                 append_error_result(errors_file, url_to_scrape, web_page_text, company_name, base_url, pip_urls)
#             else:
#                 print(f'Appending good result: {(results_file, url_to_scrape, web_page_text, company_name, base_url, pip_urls)}')
#                 append_good_result(results_file, url_to_scrape, web_page_text, company_name, base_url, pip_urls)

#         except Exception as e:
#             print(f'Error: {e}')
#             append_error_result(errors_file, url_to_scrape, web_page_text, company_name, base_url, pip_urls)