Default: Difference between revisions

From Helix Project Wiki
(Created page with "type=code lang=python import requests, re, json from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse BASE = "https://helixprojectai.com" MAIN = f"{BASE}/wiki/index.php/Main_Page" def fetch(url): r = requests.get(url, timeout=15) r.raise_for_status() return r.text # 1️⃣ fetch main page html = fetch(MAIN) soup = BeautifulSoup(html, "html.parser") # 2️⃣ collect classes & ids classes = set() ids = set() for tag in soup.find_all(Tru...")
 
(Replaced content with "{{Infobox roundtable | title = AI Roundtable – Welcome | class = mw-parser-output }} == Introduction == {{RoundtableHeader|status=Active|lead=Steve}}")
Tag: Replaced
 
Line 1: Line 1:
type=code
{{Infobox roundtable
lang=python
| title = AI Roundtable – Welcome
import requests, re, json
| class = mw-parser-output
from bs4 import BeautifulSoup
}}
from urllib.parse import urljoin, urlparse


BASE = "https://helixprojectai.com"
== Introduction ==
MAIN = f"{BASE}/wiki/index.php/Main_Page"
{{RoundtableHeader|status=Active|lead=Steve}}
 
def fetch(url, timeout=15):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Forwarded to ``requests.get`` as its ``timeout`` argument.
            Defaults to 15, the value that used to be hard-coded here, so
            existing single-argument calls behave as before.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status.
        requests.RequestException: Any other failure reported by
            ``requests.get`` (connection problems, timeouts, bad URLs).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of returning an error page as content.
    response.raise_for_status()
    return response.text
 
# 1️⃣ download the main page and parse it into a tree
html = fetch(MAIN)
soup = BeautifulSoup(html, "html.parser")

# 2️⃣ gather every class name and id used anywhere in the document
classes = set()
ids = set()
for element in soup.find_all(True):  # True matches every tag
    # A missing "class" attribute simply contributes nothing.
    classes.update(name.strip() for name in element.get("class", []))
    element_id = element.get("id")
    if element_id is not None:
        ids.add(element_id.strip())
 
# 3️⃣ resolve the URL of every <link> element whose rel mentions "stylesheet"
stylesheet_links = soup.find_all(
    "link", rel=lambda value: value and "stylesheet" in value
)
# Links without a usable href are skipped; relative hrefs are resolved
# against the main page URL.
stylesheets = [
    urljoin(MAIN, link["href"])
    for link in stylesheet_links
    if link.get("href")
]
 
# 4️⃣ fetch each stylesheet and pull selectors that match our classes/ids
def _matching_selectors(css_text, known_classes, known_ids):
    """Return the bare ``.class`` / ``#id`` selectors in *css_text* that are known.

    Only selectors consisting of a single class or id are considered
    (``.foo``, ``#bar``); compound selectors such as ``.foo:hover`` or
    ``.foo .bar`` are ignored.

    Args:
        css_text: Raw stylesheet source.
        known_classes: Class names (without the leading ``.``) to keep.
        known_ids: Ids (without the leading ``#``) to keep.

    Returns:
        A set of selector strings, each including its leading ``.`` or ``#``.
    """
    # Drop comments so one placed ahead of a rule is not glued onto its selector.
    css_text = re.sub(r"/\*.*?\*/", "", css_text, flags=re.DOTALL)
    matched = set()
    # Excluding "}" and ";" (not just "{") stops the capture from swallowing
    # the previous rule's declarations, which used to hide the first selector
    # of every rule after the first one.
    for prelude in re.findall(r"([^{};]+)\{", css_text):
        for selector in prelude.split(","):
            selector = selector.strip()
            name = selector[1:]  # text after the leading . or #
            if selector.startswith(".") and name in known_classes:
                matched.add(selector)
            elif selector.startswith("#") and name in known_ids:
                matched.add(selector)
    return matched


selector_patterns = set()
css_contents = {}
for css_url in stylesheets:
    # Only the download is best-effort; a stylesheet that cannot be fetched
    # is reported and skipped so the remaining ones are still processed.
    try:
        css_text = fetch(css_url)
    except requests.RequestException as e:
        print(f"⚠️ Could not fetch {css_url}: {e}")
        continue
    css_contents[css_url] = css_text
    selector_patterns |= _matching_selectors(css_text, classes, ids)
 
# 5️⃣ build the report and print it as JSON
# Only a sorted sample of each collection is included to keep the output short.
class_sample = sorted(classes)[:30]
id_sample = sorted(ids)[:30]
selector_sample = sorted(selector_patterns)[:40]

summary = {
    "page_url": MAIN,
    "num_classes_found": len(classes),  # counts cover the full sets
    "num_ids_found": len(ids),
    "unique_classes": class_sample,
    "unique_ids": id_sample,
    "linked_stylesheets": stylesheets,
    "extracted_selectors": selector_sample,
}

report = json.dumps(summary, indent=2)
print(report)

Latest revision as of 20:28, 6 October 2025