Keyword Power Tool
SBS Keyword Alignment
Enter your UEI to compare SBS, Capability Statement, Website, and LinkedIn keywords.
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from typing import List, Dict, Optional
import re, os, io, json, asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# PDF
import fitz # PyMuPDF
# NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP
# Headless browser for SBS (Playwright)
from playwright.async_api import async_playwright
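# Dependencies (a likely set for this script; versions not pinned):
#   pip install fastapi uvicorn pydantic httpx beautifulsoup4 pymupdf spacy playwright
#   python -m spacy download en_core_web_sm   # small English model used by keywordize()
#   playwright install chromium               # browser binary for the SBS scrape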
app = FastAPI(title="SBS Keyword Alignment API")
LINKEDIN_API = os.getenv("LINKEDIN_API") # e.g., Scrapingdog key
SOURCE = ["sbs", "capstmt", "website", "linkedin"]
class CompareOut(BaseModel):
    company_name: Optional[str]
    uei: Optional[str]
    website: Optional[str]
    linkedin_url: Optional[str]
    keywords_matrix: List[Dict]   # {"keyword": str, "sbs": bool, "capstmt": bool, "website": bool, "linkedin": bool}
    gaps: Dict[str, List[str]]    # keywords missing from each source but present elsewhere
    raw: Dict[str, str]           # raw extracted text per source
# ---------- Helpers ----------
def normalize_terms(terms: List[str]) -> List[str]:
    # Lowercase, collapse whitespace, and merge common variants
    # (e.g. "cyber security" -> "cybersecurity", "a.i." -> "ai")
    out = []
    for t in terms:
        k = re.sub(r"\s+", " ", t.strip().lower())
        k = k.replace("cyber security", "cybersecurity")
        k = k.replace("a.i.", "ai").replace("machine-learning", "machine learning")
        out.append(k)
    return out
def keywordize(text: str, nlp) -> List[str]:
    if not text:
        return []
    text = re.sub(r"\s+", " ", text)
    doc = nlp(text)
    phrases = set()
    # 1) Keep explicit noun chunks
    for np in doc.noun_chunks:
        s = np.text.strip()
        if len(s) >= 3:
            phrases.add(s)
    # 2) Add individual nouns/proper nouns (lemmatized)
    for tok in doc:
        if tok.pos_ in {"NOUN", "PROPN"} and tok.text.lower() not in SPACY_STOP and len(tok.text) > 2:
            phrases.add(tok.lemma_.strip())
    # Basic cleaning
    ph = normalize_terms(list(phrases))
    # Remove generic filler words
    generic = {"company", "business", "solution", "service", "services", "provider", "firm"}
    ph = [p for p in ph if p not in generic]
    # Merge trivial plurals by stripping a trailing "s"
    ph = list({re.sub(r"s$", "", p) for p in ph})
    return sorted(ph)
async def fetch_sbs_by_uei(uei: str) -> Dict[str, str]:
    # Uses Playwright to open SBS, search by UEI, open the profile, and extract fields
    out = {"company_name": "", "capabilities": "", "sbs_keywords": "", "website": "",
           "capstmt_url": "", "past_perf": ""}
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        ctx = await browser.new_context()
        page = await ctx.new_page()
        await page.goto("https://search.certifications.sba.gov/", wait_until="load")
        # Type the UEI into the main search box (placeholder: 'Search by business name, UEI, CAGE...')
        search_sel = "input[placeholder*='Search']"
        await page.fill(search_sel, uei)
        await page.keyboard.press("Enter")
        await page.wait_for_selector("text=Results")
        # Click the first matching row (a UEI should be unique)
        await page.click(f"tr:has-text('{uei}')")
        # Wait for the profile pane to load
        await page.wait_for_selector("text=Capabilities Narrative")

        async def grab(label: str) -> str:
            # Generic label-based extractor; the blocks below use get_by_text directly
            return await page.locator(f"xpath=//div[.//text()[contains(., '{label}')]]").inner_text()

        try:
            out["company_name"] = await page.locator("h1, h2").first.inner_text()
        except Exception:
            pass
        try:
            cap = await page.get_by_text("Capabilities Narrative").locator("xpath=following::div[1]").inner_text()
            out["capabilities"] = cap
        except Exception:
            pass
        try:
            kw = await page.get_by_text("Keywords").locator("xpath=following::div[1]").inner_text()
            out["sbs_keywords"] = kw
        except Exception:
            pass
        try:
            web = await page.get_by_text("WWW Page").locator("xpath=following::a[1]").get_attribute("href")
            out["website"] = web or ""
        except Exception:
            pass
        # The capability statement may appear as a labeled link
        try:
            caplnk = await page.get_by_text("Capability Statement").locator("xpath=following::a[1]").get_attribute("href")
            out["capstmt_url"] = caplnk or ""
        except Exception:
            pass
        # Past performance / references label
        try:
            pp = await page.get_by_text("Performance History").locator("xpath=following::div[1]").inner_text()
            out["past_perf"] = pp
        except Exception:
            pass
        await browser.close()
    return out
async def fetch_text(url: str) -> str:
    async with httpx.AsyncClient(timeout=20) as client:
        r = await client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        ct = r.headers.get("content-type", "")
        if "pdf" in ct or url.lower().endswith(".pdf"):
            # PDF -> plain text via PyMuPDF
            with fitz.open(stream=r.content, filetype="pdf") as doc:
                parts = []
                for p in doc:
                    parts.append(p.get_text())
                return "\n".join(parts)
        # HTML: strip scripts/styles/nav, keep headings, paragraphs, and list items
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        text = "\n".join(t.get_text(" ", strip=True) for t in soup.find_all(["h1", "h2", "h3", "p", "li"]))
        return re.sub(r"\s+", " ", text)
async def discover_capstmt(site_url: str) -> Optional[str]:
    # Try common paths and in-page links
    candidates = ["/capability-statement.pdf", "/CapabilityStatement.pdf",
                  "/capabilities.pdf", "/resources/capability-statement.pdf"]
    base = site_url.rstrip('/')
    async with httpx.AsyncClient(timeout=15) as client:
        # Scan the home page for PDF links whose URL contains 'capab'
        try:
            r = await client.get(base, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if href.lower().endswith(".pdf") and "capab" in href.lower():
                    return urljoin(base + "/", href)
        except Exception:
            pass
        # Fall back to probing well-known paths
        for p in candidates:
            try:
                rr = await client.head(base + p)
                if rr.status_code == 200:
                    return base + p
            except Exception:
                continue
    return None
async def fetch_linkedin_company(link: str) -> str:
    if not LINKEDIN_API:
        return ""
    # Example: Scrapingdog company endpoint
    api = f"https://api.scrapingdog.com/linkedin?api_key={LINKEDIN_API}&type=company&link={link}"
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.get(api)
        if r.status_code != 200:
            return ""
        data = r.json()
        # Best-effort fields: 'about', 'description', 'specialities'
        about = data.get("about") or data.get("description") or ""
        specs = ", ".join(data.get("specialities", []) or [])
        return about + ("\nSpecialties: " + specs if specs else "")
@app.get("/compare", response_model=CompareOut)
async def compare(uei: str = Query(..., min_length=12, max_length=12), linkedin_url: Optional[str] = None):
nlp = spacy.load("en_core_web_sm")
sbs = await fetch_sbs_by_uei(uei)
if not sbs.get("company_name"):
raise HTTPException(404, detail="SBS profile not found or not accessible.")
# Capability statement
cap_text = ""
cap_url = sbs.get("capstmt_url")
if cap_url:
try:
cap_text = await fetch_text(cap_url)
except:
cap_text = ""
if not cap_text and sbs.get("website"):
found = await discover_capstmt(sbs["website"])
if found:
try:
cap_text = await fetch_text(found)
except:
pass
# Website text (home/About)
web_text = ""
if sbs.get("website"):
try:
web_text = await fetch_text(sbs["website"])
except:
web_text = ""
# LinkedIn
li_text = ""
if linkedin_url:
li_text = await fetch_linkedin_company(linkedin_url)
# Build keyword sets
sbs_kw_field = ", ".join([x.strip() for x in (sbs.get("sbs_keywords") or "").split(',') if x.strip()])
sbs_text = sbs.get("capabilities","") + "\n" + sbs_kw_field
sbs_kw = set(keywordize(sbs_text, nlp))
cap_kw = set(keywordize(cap_text, nlp)) if cap_text else set()
web_kw = set(keywordize(web_text, nlp)) if web_text else set()
li_kw = set(keywordize(li_text, nlp)) if li_text else set()
all_kw = sorted(sbs_kw | cap_kw | web_kw | li_kw)
matrix = []
for k in all_kw:
matrix.append({
"keyword": k,
"sbs": k in sbs_kw,
"capstmt": k in cap_kw,
"website": k in web_kw,
"linkedin": k in li_kw,
})
# gaps: which keywords are missing per source but present elsewhere
def gaps_for(source_set):
return sorted([k for k in all_kw if k not in source_set])
gaps = {
"sbs": gaps_for(sbs_kw),
"capstmt": gaps_for(cap_kw),
"website": gaps_for(web_kw),
"linkedin": gaps_for(li_kw)
}
return CompareOut(
company_name=sbs.get("company_name"), uei=uei,
website=sbs.get("website"), linkedin_url=linkedin_url,
keywords_matrix=matrix,
gaps=gaps,
raw={
"sbs": sbs.get("capabilities",""),
"sbs_keywords": sbs.get("sbs_keywords",""),
"capstmt": cap_text[:5000],
"website": web_text[:5000],
"linkedin": li_text[:5000]