diff --git a/website-checker/__pycache__/app.cpython-310.pyc b/website-checker/__pycache__/app.cpython-310.pyc index fc5dde5..6611d46 100644 Binary files a/website-checker/__pycache__/app.cpython-310.pyc and b/website-checker/__pycache__/app.cpython-310.pyc differ diff --git a/website-checker/app.py b/website-checker/app.py index c88e863..f9e4403 100644 --- a/website-checker/app.py +++ b/website-checker/app.py @@ -35,6 +35,20 @@ IGNORE_SELECTORS = [ CHUNK_SIZE = 9000 # stay under 10k limit +# LanguageTool rule IDs to silently ignore +IGNORED_RULES = { + "LEERZEICHEN_VOR_SATZZEICHEN", + "WHITESPACE_RULE", + "WHITESPACE_BEFORE_PUNCTUATION", + "LEERZEICHEN_VOR_DOPPELPUNKT", + "LEERZEICHEN_VOR_SEMIKOLON", + "LEERZEICHEN_VOR_AUSRUFEZEICHEN", + "LEERZEICHEN_VOR_FRAGEZEICHEN", +} + +# Regex: space(s) directly before punctuation — catch any remaining cases +SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+[,.:;!?\"\u201C\u201D\u201E\u201F]") + def normalize_url(url: str) -> str: """Remove fragment and trailing slash for dedup.""" @@ -289,6 +303,15 @@ def check_text_with_languagetool( if resp.status_code == 200: result = resp.json() for match in result.get("matches", []): + rule_id = match.get("rule", {}).get("id", "") + if rule_id in IGNORED_RULES: + continue + # Extra filter: skip any match that is just whitespace before punctuation + m_off = match.get("offset", 0) + m_len = match.get("length", 0) + matched_text = chunk_text[m_off:m_off + m_len] + if SPACE_BEFORE_PUNCT_RE.fullmatch(matched_text): + continue match["offset"] += offset all_matches.append(match) elif resp.status_code in (401, 403): diff --git a/website-checker/templates/index.html b/website-checker/templates/index.html index b349a10..3505bfe 100644 --- a/website-checker/templates/index.html +++ b/website-checker/templates/index.html @@ -6,517 +6,186 @@
Crawlt alle Seiten einer Website und prüft Texte mit LanguageTool.
- -Crawlt alle Seiten einer Website und prüft Texte mit LanguageTool.
+ + + +