Initial commit
8
.claude/settings.local.json
Normal file
@@ -0,0 +1,8 @@
{
  "permissions": {
    "allow": [
      "Bash(pip install:*)",
      "Bash(python:*)"
    ]
  }
}
BIN
.gitignore
vendored
Normal file
Binary file not shown.
619
website-checker/app.py
Normal file
@@ -0,0 +1,619 @@
"""
LanguageTool Website Checker - Backend
Crawls a website and checks every page for spelling, grammar, and style.
"""

import asyncio
import json
import re
import threading
import time
import uuid
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.templating import Jinja2Templates

app = FastAPI()
templates = Jinja2Templates(directory=str(Path(__file__).parent / "templates"))

# In-memory storage for active check sessions
sessions: dict[str, dict] = {}

IGNORE_SELECTORS = [
    "nav", "footer", "header nav", ".cookie", "#cookie", ".cookie-banner",
    "#cookie-banner", ".cookie-consent", "#cookie-consent", ".nav", ".navbar",
    ".footer", "#footer", "#nav", "#navbar", "script", "style", "noscript",
    "iframe", ".sidebar", "#sidebar", "[role='navigation']", "[role='banner']",
    ".menu", "#menu", ".breadcrumb", ".pagination",
]

CHUNK_SIZE = 9000  # stay under 10k limit

def normalize_url(url: str) -> str:
    """Remove fragment and trailing slash for dedup."""
    parsed = urlparse(url)
    path = parsed.path.rstrip("/") or "/"
    return f"{parsed.scheme}://{parsed.netloc}{path}"

def check_domain_reachable(domain: str) -> str:
    """Check if a domain is reachable. Returns the working base URL or raises ValueError."""
    for protocol in ["https", "http"]:
        try:
            resp = requests.get(
                f"{protocol}://{domain}",
                timeout=10,
                headers={"User-Agent": "LanguageToolChecker/1.0"},
                allow_redirects=True,
            )
            if resp.status_code < 500:
                return f"{protocol}://{domain}"
        except requests.exceptions.ConnectionError:
            continue
        except requests.exceptions.Timeout:
            raise ValueError(f"Timeout: {domain} antwortet nicht innerhalb von 10 Sekunden")
        except Exception:
            continue
    raise ValueError(
        f"Domain '{domain}' ist nicht erreichbar. "
        f"Bitte prüfe die Schreibweise und ob die Website online ist."
    )

def extract_sitemap_urls(domain: str) -> list[str]:
    """Try to load sitemap.xml and extract URLs."""
    urls = []
    for protocol in ["https", "http"]:
        try:
            resp = requests.get(
                f"{protocol}://{domain}/sitemap.xml", timeout=10,
                headers={"User-Agent": "LanguageToolChecker/1.0"},
            )
            if resp.status_code == 200 and "<url" in resp.text.lower():
                soup = BeautifulSoup(resp.text, "lxml-xml")
                # Handle sitemap index
                sitemaps = soup.find_all("sitemap")
                if sitemaps:
                    for sm in sitemaps:
                        loc = sm.find("loc")
                        if loc:
                            try:
                                sub_resp = requests.get(loc.text.strip(), timeout=10)
                                if sub_resp.status_code == 200:
                                    sub_soup = BeautifulSoup(sub_resp.text, "lxml-xml")
                                    for url_tag in sub_soup.find_all("url"):
                                        loc_tag = url_tag.find("loc")
                                        if loc_tag:
                                            urls.append(loc_tag.text.strip())
                            except Exception:
                                continue
                else:
                    for url_tag in soup.find_all("url"):
                        loc = url_tag.find("loc")
                        if loc:
                            urls.append(loc.text.strip())
                if urls:
                    return urls
        except Exception:
            continue
    return urls

def crawl_links(domain: str, max_pages: int, progress_cb=None) -> list[str]:
    """Crawl internal links starting from the homepage."""
    base_url = f"https://{domain}"
    visited = set()
    to_visit = [base_url]
    found_urls = []

    while to_visit and len(found_urls) < max_pages:
        url = to_visit.pop(0)
        norm = normalize_url(url)
        if norm in visited:
            continue
        visited.add(norm)

        try:
            resp = requests.get(url, timeout=10, headers={"User-Agent": "LanguageToolChecker/1.0"})
            if resp.status_code != 200:
                continue
            content_type = resp.headers.get("content-type", "")
            if "text/html" not in content_type:
                continue
        except Exception:
            continue

        found_urls.append(url)
        if progress_cb:
            progress_cb(f"Crawling: {len(found_urls)} Seiten gefunden...")

        soup = BeautifulSoup(resp.text, "lxml")
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            full_url = urljoin(url, href)
            parsed = urlparse(full_url)

            # Only same domain, no fragments, no file extensions
            if parsed.netloc and parsed.netloc != domain:
                continue
            if not parsed.netloc:
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)

            skip_ext = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg",
                        ".zip", ".mp3", ".mp4", ".doc", ".docx", ".xls", ".xlsx")
            if any(parsed.path.lower().endswith(ext) for ext in skip_ext):
                continue

            clean = normalize_url(full_url)
            if clean not in visited and len(to_visit) < max_pages * 3:
                to_visit.append(full_url)

    return found_urls

def extract_page_content(html: str, url: str) -> dict:
    """Extract visible text from a page, split by semantic sections."""
    soup = BeautifulSoup(html, "lxml")

    # Remove ignored elements
    for selector in IGNORE_SELECTORS:
        try:
            for el in soup.select(selector):
                el.decompose()
        except Exception:
            continue

    sections = []

    # Title tag
    title = soup.find("title")
    if title and title.string and title.string.strip():
        sections.append({"type": "Title", "text": title.string.strip()})

    # Meta description
    meta_desc = soup.find("meta", attrs={"name": "description"})
    if meta_desc and meta_desc.get("content", "").strip():
        sections.append({"type": "Meta Description", "text": meta_desc["content"].strip()})

    # Body content
    body = soup.find("body")
    if not body:
        return {"url": url, "sections": sections}

    # Walk the DOM in document order, skipping children of already-captured elements.
    # "Block" tags are content containers (headings, paragraphs, list items, etc.)
    # "Inline" tags (a, button) are only captured when they are NOT inside a block tag.
    block_tags = {
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "li", "blockquote", "figcaption", "td", "th",
    }
    inline_tags = {"a", "button"}
    tag_labels = {
        "h1": "Überschrift (H1)", "h2": "Überschrift (H2)", "h3": "Überschrift (H3)",
        "h4": "Überschrift (H4)", "h5": "Überschrift (H5)", "h6": "Überschrift (H6)",
        "p": "Absatz", "li": "Listeneintrag", "blockquote": "Zitat",
        "figcaption": "Bildunterschrift", "td": "Tabellenzelle", "th": "Tabellenkopf",
        "button": "Button", "a": "Link-Text",
    }
    all_relevant = block_tags | inline_tags

    seen_texts = set()
    captured_elements = set()  # track element ids so children are skipped

    for el in body.find_all(all_relevant):
        # Skip if this element is nested inside an already-captured block element
        skip = False
        for parent in el.parents:
            if id(parent) in captured_elements:
                skip = True
                break
            if parent is body:
                break
        if skip:
            continue

        tag_name = el.name
        text = el.get_text(separator=" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()

        if not text or len(text) <= 2 or text in seen_texts:
            continue
        if tag_name in inline_tags and (len(text) <= 3 or len(text) >= 200):
            continue

        seen_texts.add(text)
        label = tag_labels.get(tag_name, tag_name)
        sections.append({"type": label, "text": text})

        # Mark block elements as captured so their children are skipped
        if tag_name in block_tags:
            captured_elements.add(id(el))

    return {"url": url, "sections": sections}

def check_text_with_languagetool(
    text: str, language: str, username: str, api_key: str
) -> list[dict]:
    """Send text to LanguageTool API and return matches."""
    if not text.strip():
        return []

    all_matches = []
    chunks = []

    if len(text) > CHUNK_SIZE:
        # Split at sentence boundaries
        sentences = re.split(r"(?<=[.!?])\s+", text)
        current_chunk = ""
        current_offset = 0
        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 > CHUNK_SIZE:
                if current_chunk:
                    chunks.append((current_chunk, current_offset))
                    current_offset += len(current_chunk) + 1
                current_chunk = sentence
            else:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence
        if current_chunk:
            chunks.append((current_chunk, current_offset))
    else:
        chunks = [(text, 0)]

    for chunk_text, offset in chunks:
        data = {
            "text": chunk_text,
            "language": language,
            "username": username,
            "apiKey": api_key,
            "enabledOnly": "false",
        }

        try:
            resp = requests.post(
                "https://api.languagetoolplus.com/v2/check",
                data=data,
                timeout=30,
            )
            if resp.status_code == 200:
                result = resp.json()
                for match in result.get("matches", []):
                    match["offset"] += offset
                    all_matches.append(match)
            elif resp.status_code in (401, 403):
                raise ValueError(f"API-Authentifizierung fehlgeschlagen: {resp.text}")
            else:
                raise ValueError(f"LanguageTool API Fehler ({resp.status_code}): {resp.text}")
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Netzwerkfehler bei LanguageTool API: {e}")

        # Rate limiting: max 2 requests/sec
        time.sleep(0.5)

    return all_matches

def categorize_match(match: dict) -> str:
    """Categorize a LanguageTool match into spelling/grammar/style."""
    rule = match.get("rule", {})
    category_id = rule.get("category", {}).get("id", "")
    issue_type = rule.get("issueType", "")

    if category_id in ("TYPOS", "SPELLING") or issue_type == "misspelling":
        return "spelling"
    elif category_id in ("GRAMMAR", "PUNCTUATION", "SYNTAX") or issue_type == "grammar":
        return "grammar"
    else:
        return "style"

def process_matches_for_section(section_text: str, matches: list[dict], section_offset: int) -> list[dict]:
    """Filter matches that belong to this section and adjust offsets."""
    section_matches = []
    section_end = section_offset + len(section_text)

    for match in matches:
        m_offset = match.get("offset", 0)
        m_length = match.get("length", 0)

        if m_offset >= section_offset and m_offset + m_length <= section_end:
            section_matches.append({
                "offset": m_offset - section_offset,
                "length": m_length,
                "message": match.get("message", ""),
                "shortMessage": match.get("shortMessage", ""),
                "replacements": [r.get("value", "") for r in match.get("replacements", [])[:5]],
                "category": categorize_match(match),
                "rule": match.get("rule", {}).get("description", ""),
                "context_text": match.get("context", {}).get("text", ""),
            })

    return section_matches

def crawl_domain(domain: str, max_pages: int) -> dict:
    """Crawl a domain and return the list of found URLs (synchronous)."""
    check_domain_reachable(domain)

    urls = extract_sitemap_urls(domain)
    sitemap_used = bool(urls)

    if not urls:
        urls = crawl_links(domain, max_pages)

    urls = urls[:max_pages]
    return {"urls": urls, "sitemap_used": sitemap_used}

def run_check(session_id: str, domain: str, language: str,
              username: str, api_key: str, urls: list[str]):
    """Run the check pipeline for a given list of URLs (synchronous, runs in a thread)."""
    session = sessions[session_id]

    try:
        session["status"] = "checking"
        session["progress"] = {"current": 0, "total": len(urls), "page": ""}
        session["message"] = f"Prüfe {len(urls)} Seiten..."

        results = []
        total_errors = {"spelling": 0, "grammar": 0, "style": 0}

        for i, url in enumerate(urls):
            session["progress"]["current"] = i + 1
            session["progress"]["page"] = url
            session["message"] = f"Prüfe Seite {i + 1}/{len(urls)}: {url}"

            page_result = {
                "url": url,
                "sections": [],
                "error_count": {"spelling": 0, "grammar": 0, "style": 0},
                "total_errors": 0,
                "skipped": False,
                "error_message": None,
            }

            try:
                resp = requests.get(
                    url, timeout=10,
                    headers={"User-Agent": "LanguageToolChecker/1.0"}
                )
                if resp.status_code != 200:
                    page_result["skipped"] = True
                    page_result["error_message"] = f"HTTP {resp.status_code}"
                    results.append(page_result)
                    continue

                content_type = resp.headers.get("content-type", "")
                if "text/html" not in content_type:
                    page_result["skipped"] = True
                    page_result["error_message"] = f"Kein HTML: {content_type}"
                    results.append(page_result)
                    continue

                page_data = extract_page_content(resp.text, url)

                # Build full text for API check
                full_text_parts = []
                section_offsets = []
                current_offset = 0

                for section in page_data["sections"]:
                    section_offsets.append(current_offset)
                    full_text_parts.append(section["text"])
                    current_offset += len(section["text"]) + 1  # +1 for newline

                full_text = "\n".join(full_text_parts)

                if not full_text.strip():
                    page_result["sections"] = [
                        {"type": s["type"], "text": s["text"], "matches": []}
                        for s in page_data["sections"]
                    ]
                    results.append(page_result)
                    continue

                # Check with LanguageTool
                try:
                    matches = check_text_with_languagetool(
                        full_text, language, username, api_key
                    )
                except ValueError as e:
                    error_msg = str(e)
                    if "Authentifizierung" in error_msg:
                        session["status"] = "error"
                        session["message"] = error_msg
                        return
                    page_result["skipped"] = True
                    page_result["error_message"] = error_msg
                    results.append(page_result)
                    continue

                # Distribute matches to sections
                for j, section in enumerate(page_data["sections"]):
                    sec_offset = section_offsets[j] if j < len(section_offsets) else 0
                    sec_matches = process_matches_for_section(
                        section["text"], matches, sec_offset
                    )
                    page_result["sections"].append({
                        "type": section["type"],
                        "text": section["text"],
                        "matches": sec_matches,
                    })

                    for m in sec_matches:
                        cat = m["category"]
                        page_result["error_count"][cat] += 1
                        total_errors[cat] += 1

                page_result["total_errors"] = sum(page_result["error_count"].values())

            except requests.exceptions.Timeout:
                page_result["skipped"] = True
                page_result["error_message"] = "Timeout (>10s)"
            except Exception as e:
                page_result["skipped"] = True
                page_result["error_message"] = str(e)[:200]

            results.append(page_result)

        # Sort by error count descending
        results.sort(key=lambda r: r["total_errors"], reverse=True)

        session["status"] = "done"
        session["message"] = "Prüfung abgeschlossen"
        session["results"] = {
            "domain": domain,
            "language": language,
            "pages_checked": len(results),
            "pages_skipped": sum(1 for r in results if r["skipped"]),
            "total_errors": total_errors,
            "pages": results,
        }

    except Exception as e:
        session["status"] = "error"
        session["message"] = f"Unerwarteter Fehler: {str(e)[:300]}"

@app.get("/", response_class=HTMLResponse)
|
||||||
|
async def index(request: Request):
|
||||||
|
return templates.TemplateResponse("index.html", {"request": request})
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/crawl")
|
||||||
|
async def crawl(request: Request):
|
||||||
|
"""Crawl a domain and return the list of URLs for preview."""
|
||||||
|
body = await request.json()
|
||||||
|
domain = body.get("domain", "").strip().lower()
|
||||||
|
max_pages = int(body.get("maxPages", 50))
|
||||||
|
|
||||||
|
domain = re.sub(r"^https?://", "", domain)
|
||||||
|
domain = domain.rstrip("/")
|
||||||
|
|
||||||
|
if not domain:
|
||||||
|
return {"error": "Bitte eine Domain eingeben"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = crawl_domain(domain, max_pages)
|
||||||
|
return {
|
||||||
|
"urls": result["urls"],
|
||||||
|
"sitemap_used": result["sitemap_used"],
|
||||||
|
"domain": domain,
|
||||||
|
}
|
||||||
|
except ValueError as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/check")
|
||||||
|
async def start_check(request: Request):
|
||||||
|
body = await request.json()
|
||||||
|
domain = body.get("domain", "").strip().lower()
|
||||||
|
language = body.get("language", "de-DE")
|
||||||
|
username = body.get("username", "").strip()
|
||||||
|
api_key = body.get("apiKey", "").strip()
|
||||||
|
urls = body.get("urls", [])
|
||||||
|
|
||||||
|
domain = re.sub(r"^https?://", "", domain)
|
||||||
|
domain = domain.rstrip("/")
|
||||||
|
|
||||||
|
if not domain:
|
||||||
|
return {"error": "Bitte eine Domain eingeben"}
|
||||||
|
if not username or not api_key:
|
||||||
|
return {"error": "Bitte LanguageTool Credentials eingeben"}
|
||||||
|
if not urls:
|
||||||
|
return {"error": "Keine URLs zum Prüfen ausgewählt"}
|
||||||
|
|
||||||
|
session_id = str(uuid.uuid4())
|
||||||
|
sessions[session_id] = {
|
||||||
|
"status": "starting",
|
||||||
|
"message": "Starte Prüfung...",
|
||||||
|
"progress": {"current": 0, "total": 0, "page": ""},
|
||||||
|
"results": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
thread = threading.Thread(
|
||||||
|
target=run_check,
|
||||||
|
args=(session_id, domain, language, username, api_key, urls),
|
||||||
|
daemon=True,
|
||||||
|
)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
|
return {"sessionId": session_id}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/status/{session_id}")
|
||||||
|
async def get_status(session_id: str):
|
||||||
|
session = sessions.get(session_id)
|
||||||
|
if not session:
|
||||||
|
return {"error": "Session nicht gefunden"}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": session["status"],
|
||||||
|
"message": session["message"],
|
||||||
|
"progress": session["progress"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/results/{session_id}")
|
||||||
|
async def get_results(session_id: str):
|
||||||
|
session = sessions.get(session_id)
|
||||||
|
if not session:
|
||||||
|
return {"error": "Session nicht gefunden"}
|
||||||
|
if session["status"] != "done":
|
||||||
|
return {"error": "Prüfung noch nicht abgeschlossen", "status": session["status"]}
|
||||||
|
|
||||||
|
results = session["results"]
|
||||||
|
# Clean up session after delivering results
|
||||||
|
# (keep it around for a bit in case of re-requests)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/stream/{session_id}")
|
||||||
|
async def stream_status(session_id: str):
|
||||||
|
"""SSE endpoint for live progress updates."""
|
||||||
|
async def event_generator():
|
||||||
|
while True:
|
||||||
|
session = sessions.get(session_id)
|
||||||
|
if not session:
|
||||||
|
yield f"data: {json.dumps({'status': 'error', 'message': 'Session nicht gefunden'})}\n\n"
|
||||||
|
break
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"status": session["status"],
|
||||||
|
"message": session["message"],
|
||||||
|
"progress": session["progress"],
|
||||||
|
}
|
||||||
|
yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
|
||||||
|
|
||||||
|
if session["status"] in ("done", "error"):
|
||||||
|
break
|
||||||
|
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
return StreamingResponse(
|
||||||
|
event_generator(),
|
||||||
|
media_type="text/event-stream",
|
||||||
|
headers={
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Accel-Buffering": "no",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    print("\n LanguageTool Website Checker")
    print(" ============================")
    print(" Öffne http://localhost:8000 im Browser\n")
    uvicorn.run(app, host="0.0.0.0", port=8000)
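The four routes /api/crawl, /api/check, /api/status/{session_id}, and /api/results/{session_id} are enough to drive a full run from a script. A minimal sketch, assuming the server above is already running on http://localhost:8000 (e.g. started with python website-checker/app.py after installing the imported packages under their usual PyPI names: fastapi, uvicorn, requests, beautifulsoup4, lxml, jinja2) and that USERNAME/API_KEY are placeholders for real LanguageTool credentials:

import time

import requests

BASE = "http://localhost:8000"   # matches uvicorn.run(...) above
USERNAME = "you@example.com"     # placeholder LanguageTool account
API_KEY = "your-api-key"         # placeholder API key

# 1. Crawl the domain to get the URL preview (sitemap first, link crawl as fallback).
crawl = requests.post(f"{BASE}/api/crawl",
                      json={"domain": "example.com", "maxPages": 10}).json()

# 2. Start the check; the server spawns a background thread and returns a session id.
check = requests.post(f"{BASE}/api/check", json={
    "domain": crawl["domain"],
    "language": "de-DE",
    "username": USERNAME,
    "apiKey": API_KEY,
    "urls": crawl["urls"],
}).json()
session_id = check["sessionId"]

# 3. Poll the status endpoint until the background check reports "done" or "error".
while True:
    status = requests.get(f"{BASE}/api/status/{session_id}").json()
    print(status["message"])
    if status["status"] in ("done", "error"):
        break
    time.sleep(1)

# 4. Fetch the aggregated results (pages sorted by total error count).
if status["status"] == "done":
    results = requests.get(f"{BASE}/api/results/{session_id}").json()
    print(results["total_errors"])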
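Instead of polling, the SSE stream at /api/stream/{session_id} can be consumed directly; a small sketch using requests' streaming mode, where session_id is assumed to come from a prior /api/check call:

import json

import requests

def follow_progress(session_id: str, base: str = "http://localhost:8000") -> None:
    # The endpoint emits one "data: {...}" line per update, terminated by a blank line.
    with requests.get(f"{base}/api/stream/{session_id}", stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue
            event = json.loads(line[len("data: "):])
            print(event["status"], "-", event.get("message", ""))
            if event["status"] in ("done", "error"):
                break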