Default
From Helix Project Wiki
import json
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Target MediaWiki instance and the page whose CSS usage we inspect.
BASE = "https://helixprojectai.com"
MAIN = f"{BASE}/wiki/index.php/Main_Page"
def fetch(url):
    """GET *url* and return the response body as text.

    Uses a 15-second timeout; raises requests.HTTPError (via
    raise_for_status) on any non-2xx response.
    """
    r = requests.get(url, timeout=15)
    r.raise_for_status()
    return r.text
# 1. Fetch the main page and parse it into a DOM tree.
html = fetch(MAIN)
soup = BeautifulSoup(html, "html.parser")
# 2. Collect every class name and id attribute used anywhere on the page.
#    find_all(True) matches every tag; BeautifulSoup returns the "class"
#    attribute as a list of individual class names.
classes = set()
ids = set()
for tag in soup.find_all(True):
    if tag.has_attr("class"):
        for c in tag["class"]:
            classes.add(c.strip())
    if tag.has_attr("id"):
        ids.add(tag["id"].strip())
# 3. Find linked stylesheets. The rel attribute is a list (e.g.
#    ["stylesheet"]), so match via a predicate rather than equality.
stylesheets = []
for link in soup.find_all("link", rel=lambda x: x and "stylesheet" in x):
    href = link.get("href")
    if href:
        # Resolve relative hrefs against the page URL.
        full = urljoin(MAIN, href)
        stylesheets.append(full)
# 4. Fetch each stylesheet and pull out the class/id selectors that match
#    names we actually saw on the page.

def _matching_selectors(css_text, classes, ids):
    """Return the set of simple .class / #id selectors in *css_text* whose
    name appears in *classes* / *ids* respectively.

    Very simple regex-based scan: capture the selector text before each
    `{`. The character class excludes `}` as well as `{` — otherwise the
    capture for every rule after the first would start inside the previous
    rule's declaration block (e.g. `color:red}.foo`) and the selector
    would be silently dropped by the prefix check below.
    """
    found = set()
    for selector_block in re.findall(r"([^{}]+)\{", css_text):
        # Selector lists are comma-separated; strip whitespace per entry.
        for sel in selector_block.split(","):
            sel = sel.strip()
            # Keep only simple selectors naming a class or id we already saw.
            if sel.startswith(".") or sel.startswith("#"):
                name = sel[1:]  # strip leading . or #
                if (sel.startswith(".") and name in classes) or (
                    sel.startswith("#") and name in ids
                ):
                    found.add(sel)
    return found

selector_patterns = set()
css_contents = {}
for css_url in stylesheets:
    try:
        css_text = fetch(css_url)
        css_contents[css_url] = css_text
        selector_patterns |= _matching_selectors(css_text, classes, ids)
    except Exception as e:
        # Best-effort: a bad stylesheet URL should not abort the whole scan.
        print(f"⚠️ Could not fetch {css_url}: {e}")
# 5. Summarize the findings as pretty-printed JSON on stdout.
summary = {
    "page_url": MAIN,
    "num_classes_found": len(classes),
    "num_ids_found": len(ids),
    "unique_classes": sorted(classes)[:30],  # first 30 as sample
    "unique_ids": sorted(ids)[:30],
    "linked_stylesheets": stylesheets,
    "extracted_selectors": sorted(selector_patterns)[:40],  # first 40 as sample
}
print(json.dumps(summary, indent=2))
