Keyword Power Tool
SBS Keyword Alignment
Enter your UEI to compare SBS, Capability Statement, Website, and LinkedIn keywords.
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from typing import List, Dict, Optional
import re, os, io, json, asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# PDF
import fitz # PyMuPDF
# NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP
# Headless browser for SBS (Playwright)
from playwright.async_api import async_playwright
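# Dependencies (a likely set for this script; versions not pinned):
#   pip install fastapi uvicorn pydantic httpx beautifulsoup4 pymupdf spacy playwright
#   python -m spacy download en_core_web_sm   # small English model used by keywordize()
#   playwright install chromium               # browser binary for the SBS scrape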
app = FastAPI(title="SBS Keyword Alignment API")
LINKEDIN_API = os.getenv("LINKEDIN_API") # e.g., Scrapingdog key
SOURCE = ["sbs", "capstmt", "website", "linkedin"]
class CompareOut(BaseModel):
    company_name: Optional[str]
    uei: Optional[str]
    website: Optional[str]
    linkedin_url: Optional[str]
    keywords_matrix: List[Dict]   # {"keyword": str, "sbs": bool, "capstmt": bool, "website": bool, "linkedin": bool}
    gaps: Dict[str, List[str]]    # keywords missing from each source but present elsewhere
    raw: Dict[str, str]           # raw extracted text per source
# ---------- Helpers ----------
def normalize_terms(terms: List[str]) -> List[str]:
    # Lowercase, collapse whitespace, and merge common variants
    # (e.g. "cyber security" -> "cybersecurity", "a.i." -> "ai")
    out = []
    for t in terms:
        k = re.sub(r"\s+", " ", t.strip().lower())
        k = k.replace("cyber security", "cybersecurity")
        k = k.replace("a.i.", "ai").replace("machine-learning", "machine learning")
        out.append(k)
    return out
def keywordize(text: str, nlp) -> List[str]:
    if not text:
        return []
    text = re.sub(r"\s+", " ", text)
    doc = nlp(text)
    phrases = set()
    # 1) Keep explicit noun chunks
    for np in doc.noun_chunks:
        s = np.text.strip()
        if len(s) >= 3:
            phrases.add(s)
    # 2) Add individual nouns/proper nouns (lemmatized)
    for tok in doc:
        if tok.pos_ in {"NOUN", "PROPN"} and tok.text.lower() not in SPACY_STOP and len(tok.text) > 2:
            phrases.add(tok.lemma_.strip())
    # Basic cleaning
    ph = normalize_terms(list(phrases))
    # Remove generic filler words
    generic = {"company", "business", "solution", "service", "services", "provider", "firm"}
    ph = [p for p in ph if p not in generic]
    # Merge trivial plurals by stripping a trailing "s"
    ph = list({re.sub(r"s$", "", p) for p in ph})
    return sorted(ph)
async def fetch_sbs_by_uei(uei: str) -> Dict[str, str]:
    # Uses Playwright to open SBS, search by UEI, open the profile, and extract fields
    out = {"company_name": "", "capabilities": "", "sbs_keywords": "", "website": "",
           "capstmt_url": "", "past_perf": ""}
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        ctx = await browser.new_context()
        page = await ctx.new_page()
        await page.goto("https://search.certifications.sba.gov/", wait_until="load")
        # Type the UEI into the main search box (placeholder: 'Search by business name, UEI, CAGE...')
        search_sel = "input[placeholder*='Search']"
        await page.fill(search_sel, uei)
        await page.keyboard.press("Enter")
        await page.wait_for_selector("text=Results")
        # Click the first matching row (a UEI should be unique)
        await page.click(f"tr:has-text('{uei}')")
        # Wait for the profile pane to load
        await page.wait_for_selector("text=Capabilities Narrative")

        async def grab(label: str) -> str:
            # Generic label-based extractor; the blocks below use get_by_text directly
            return await page.locator(f"xpath=//div[.//text()[contains(., '{label}')]]").inner_text()

        try:
            out["company_name"] = await page.locator("h1, h2").first.inner_text()
        except Exception:
            pass
        try:
            cap = await page.get_by_text("Capabilities Narrative").locator("xpath=following::div[1]").inner_text()
            out["capabilities"] = cap
        except Exception:
            pass
        try:
            kw = await page.get_by_text("Keywords").locator("xpath=following::div[1]").inner_text()
            out["sbs_keywords"] = kw
        except Exception:
            pass
        try:
            web = await page.get_by_text("WWW Page").locator("xpath=following::a[1]").get_attribute("href")
            out["website"] = web or ""
        except Exception:
            pass
        # The capability statement may appear as a labeled link
        try:
            caplnk = await page.get_by_text("Capability Statement").locator("xpath=following::a[1]").get_attribute("href")
            out["capstmt_url"] = caplnk or ""
        except Exception:
            pass
        # Past performance / references label
        try:
            pp = await page.get_by_text("Performance History").locator("xpath=following::div[1]").inner_text()
            out["past_perf"] = pp
        except Exception:
            pass
        await browser.close()
    return out
async def fetch_text(url: str) -> str:
    async with httpx.AsyncClient(timeout=20) as client:
        r = await client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        ct = r.headers.get("content-type", "")
        if "pdf" in ct or url.lower().endswith(".pdf"):
            # PDF -> plain text via PyMuPDF
            with fitz.open(stream=r.content, filetype="pdf") as doc:
                parts = []
                for p in doc:
                    parts.append(p.get_text())
                return "\n".join(parts)
        # HTML: strip scripts/styles/nav, keep headings, paragraphs, and list items
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        text = "\n".join(t.get_text(" ", strip=True) for t in soup.find_all(["h1", "h2", "h3", "p", "li"]))
        return re.sub(r"\s+", " ", text)
async def discover_capstmt(site_url: str) -> Optional[str]:
    # Try common paths and in-page links
    candidates = ["/capability-statement.pdf", "/CapabilityStatement.pdf",
                  "/capabilities.pdf", "/resources/capability-statement.pdf"]
    base = site_url.rstrip('/')
    async with httpx.AsyncClient(timeout=15) as client:
        # Scan the home page for PDF links whose URL contains 'capab'
        try:
            r = await client.get(base, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if href.lower().endswith(".pdf") and "capab" in href.lower():
                    return urljoin(base + "/", href)
        except Exception:
            pass
        # Fall back to probing well-known paths
        for p in candidates:
            try:
                rr = await client.head(base + p)
                if rr.status_code == 200:
                    return base + p
            except Exception:
                continue
    return None
async def fetch_linkedin_company(link: str) -> str:
    if not LINKEDIN_API:
        return ""
    # Example: Scrapingdog company endpoint
    api = f"https://api.scrapingdog.com/linkedin?api_key={LINKEDIN_API}&type=company&link={link}"
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.get(api)
        if r.status_code != 200:
            return ""
        data = r.json()
        # Best-effort fields: 'about', 'description', 'specialities'
        about = data.get("about") or data.get("description") or ""
        specs = ", ".join(data.get("specialities", []) or [])
        return about + ("\nSpecialties: " + specs if specs else "")
@app.get("/compare", response_model=CompareOut)
async def compare(uei: str = Query(..., min_length=12, max_length=12), linkedin_url: Optional[str] = None):
nlp = spacy.load("en_core_web_sm")
sbs = await fetch_sbs_by_uei(uei)
if not sbs.get("company_name"):
raise HTTPException(404, detail="SBS profile not found or not accessible.")
# Capability statement
cap_text = ""
cap_url = sbs.get("capstmt_url")
if cap_url:
try:
cap_text = await fetch_text(cap_url)
except:
cap_text = ""
if not cap_text and sbs.get("website"):
found = await discover_capstmt(sbs["website"])
if found:
try:
cap_text = await fetch_text(found)
except:
pass
# Website text (home/About)
web_text = ""
if sbs.get("website"):
try:
web_text = await fetch_text(sbs["website"])
except:
web_text = ""
# LinkedIn
li_text = ""
if linkedin_url:
li_text = await fetch_linkedin_company(linkedin_url)
# Build keyword sets
sbs_kw_field = ", ".join([x.strip() for x in (sbs.get("sbs_keywords") or "").split(',') if x.strip()])
sbs_text = sbs.get("capabilities","") + "\n" + sbs_kw_field
sbs_kw = set(keywordize(sbs_text, nlp))
cap_kw = set(keywordize(cap_text, nlp)) if cap_text else set()
web_kw = set(keywordize(web_text, nlp)) if web_text else set()
li_kw = set(keywordize(li_text, nlp)) if li_text else set()
all_kw = sorted(sbs_kw | cap_kw | web_kw | li_kw)
matrix = []
for k in all_kw:
matrix.append({
"keyword": k,
"sbs": k in sbs_kw,
"capstmt": k in cap_kw,
"website": k in web_kw,
"linkedin": k in li_kw,
})
# gaps: which keywords are missing per source but present elsewhere
def gaps_for(source_set):
return sorted([k for k in all_kw if k not in source_set])
gaps = {
"sbs": gaps_for(sbs_kw),
"capstmt": gaps_for(cap_kw),
"website": gaps_for(web_kw),
"linkedin": gaps_for(li_kw)
}
return CompareOut(
company_name=sbs.get("company_name"), uei=uei,
website=sbs.get("website"), linkedin_url=linkedin_url,
keywords_matrix=matrix,
gaps=gaps,
raw={
"sbs": sbs.get("capabilities",""),
"sbs_keywords": sbs.get("sbs_keywords",""),
"capstmt": cap_text[:5000],
"website": web_text[:5000],
"linkedin": li_text[:5000]