|
|
| Line 1: |
Line 1: |
| type=code
| | {{Infobox roundtable |
| lang=python
| | | title = AI Roundtable – Welcome |
| import requests, re, json
| | | class = mw-parser-output |
| from bs4 import BeautifulSoup
| | }} |
| from urllib.parse import urljoin, urlparse
| |
|
| |
|
| BASE = "https://helixprojectai.com"
| | == Introduction == |
| MAIN = f"{BASE}/wiki/index.php/Main_Page"
| | {{RoundtableHeader|status=Active|lead=Steve}} |
| | |
def fetch(url):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address to request.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status. Exceptions raised by
            ``requests.get`` itself are not caught here and propagate.
    """
    # 15-second timeout so a stalled server cannot hang the script.
    r = requests.get(url, timeout=15)
    r.raise_for_status()
    return r.text
| |
| | |
# 1️⃣ fetch main page
# Download the wiki main page and parse it with the stdlib-backed
# "html.parser" builder; `soup` is what the later steps walk.
html = fetch(MAIN)
soup = BeautifulSoup(html, "html.parser")
| |
| | |
# 2️⃣ collect classes & ids
# Walk every tag in the parsed page and record each distinct class token
# and id value, whitespace-stripped.
classes = set()
ids = set()
for tag in soup.find_all(True):
    # `class` is looked up with a list default so tags without the
    # attribute contribute nothing.
    classes.update(c.strip() for c in tag.get("class", []))
    tag_id = tag.get("id")
    if tag_id is not None:
        ids.add(tag_id.strip())
| |
| | |
# 3️⃣ find linked stylesheets
# Every <link> whose rel contains "stylesheet" and that has a non-empty
# href, resolved to an absolute URL against the main page address.
# Document order is kept and duplicates are not removed.
stylesheets = [
    urljoin(MAIN, link.get("href"))
    for link in soup.find_all("link", rel=lambda x: x and "stylesheet" in x)
    if link.get("href")
]
| |
| | |
# 4️⃣ fetch each stylesheet and pull selectors that match our classes/ids
# Only bare selectors of the form ".name" or "#name" are kept, and only
# when `name` was seen on the page; compound selectors are ignored.

# CSS comments are removed first so a comment sitting in front of a
# selector does not hide its leading "." or "#".
_CSS_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL)
# Selector text immediately before "{". Excluding "}" as well as "{" stops
# the match from swallowing the previous rule's declarations, which would
# otherwise make every rule after the first fail the startswith test.
_SELECTOR_BLOCK_RE = re.compile(r"([^{}]+)\{")

selector_patterns = set()
css_contents = {}
for css_url in stylesheets:
    # Best effort: a stylesheet that cannot be downloaded is reported and
    # skipped. Only the download is guarded, so parsing errors surface.
    try:
        css_text = fetch(css_url)
    except requests.RequestException as e:
        print(f"⚠️ Could not fetch {css_url}: {e}")
        continue
    css_contents[css_url] = css_text  # raw text, comments included

    stripped_css = _CSS_COMMENT_RE.sub("", css_text)
    for selector_block in _SELECTOR_BLOCK_RE.findall(stripped_css):
        # A rule may list several selectors separated by commas.
        for sel in selector_block.split(","):
            sel = sel.strip()
            if sel.startswith("."):
                known = classes
            elif sel.startswith("#"):
                known = ids
            else:
                continue
            # Compare the name without its leading "." or "#".
            if sel[1:] in known:
                selector_patterns.add(sel)
| |
| | |
# 5️⃣ Summarize
# Build a JSON-serialisable report and print it. The class, id and
# selector lists are sorted and truncated so the output stays readable;
# the counts still reflect the full sets.
summary = {
    "page_url": MAIN,
    "num_classes_found": len(classes),
    "num_ids_found": len(ids),
    "unique_classes": sorted(classes)[:30],  # first 30 as sample
    "unique_ids": sorted(ids)[:30],
    "linked_stylesheets": stylesheets,
    "extracted_selectors": sorted(selector_patterns)[:40],  # first 40 as sample
}

print(json.dumps(summary, indent=2))
| |