diff --git a/website-checker/__pycache__/app.cpython-310.pyc b/website-checker/__pycache__/app.cpython-310.pyc index fc5dde5..6611d46 100644 Binary files a/website-checker/__pycache__/app.cpython-310.pyc and b/website-checker/__pycache__/app.cpython-310.pyc differ diff --git a/website-checker/app.py b/website-checker/app.py index c88e863..f9e4403 100644 --- a/website-checker/app.py +++ b/website-checker/app.py @@ -35,6 +35,20 @@ IGNORE_SELECTORS = [ CHUNK_SIZE = 9000 # stay under 10k limit +# LanguageTool rule IDs to silently ignore +IGNORED_RULES = { + "LEERZEICHEN_VOR_SATZZEICHEN", + "WHITESPACE_RULE", + "WHITESPACE_BEFORE_PUNCTUATION", + "LEERZEICHEN_VOR_DOPPELPUNKT", + "LEERZEICHEN_VOR_SEMIKOLON", + "LEERZEICHEN_VOR_AUSRUFEZEICHEN", + "LEERZEICHEN_VOR_FRAGEZEICHEN", +} + +# Regex: space(s) directly before punctuation — catch any remaining cases +SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+[,.:;!?\"\u201C\u201D\u201E\u201F]") + def normalize_url(url: str) -> str: """Remove fragment and trailing slash for dedup.""" @@ -289,6 +303,15 @@ def check_text_with_languagetool( if resp.status_code == 200: result = resp.json() for match in result.get("matches", []): + rule_id = match.get("rule", {}).get("id", "") + if rule_id in IGNORED_RULES: + continue + # Extra filter: skip any match that is just whitespace before punctuation + m_off = match.get("offset", 0) + m_len = match.get("length", 0) + matched_text = chunk_text[m_off:m_off + m_len] + if SPACE_BEFORE_PUNCT_RE.fullmatch(matched_text): + continue match["offset"] += offset all_matches.append(match) elif resp.status_code in (401, 403): diff --git a/website-checker/templates/index.html b/website-checker/templates/index.html index b349a10..3505bfe 100644 --- a/website-checker/templates/index.html +++ b/website-checker/templates/index.html @@ -6,517 +6,186 @@ LanguageTool Website Checker -

LanguageTool Website Checker

← Zurück zur Übersicht -
+
Rechtschreibung
Grammatik
Stil
@@ -524,87 +193,52 @@
- -
+
In Zwischenablage kopiert
- +
-
-
-

Website prüfen

-

Crawlt alle Seiten einer Website und prüft Texte mit LanguageTool.

- -
- - -
- -
- - -
- -
- - -
- -
-
- - -
-
- - -
-
- - -
+
+

Website prüfen

+

Crawlt alle Seiten einer Website und prüft Texte mit LanguageTool.

+
+
+
+
+
+
-
+ +
+
- +
-
-
-
-
Starte...
-
-
-
-
-
-
+
+
+
Starte...
+
+
+
- +
-
-
-

Gefundene Seiten

-
-
- - - -
-
-
-
- - -
+
+

Gefundene Seiten

+
+
+ + +
-
+
+
+
+ + +
+
@@ -612,6 +246,8 @@
+ +
@@ -621,544 +257,330 @@
-
+
+} +// ==================== UTILS ==================== +function esc(s) { return s ? s.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;') : ''; } +function escRx(s) { return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } +document.querySelectorAll('#formView input').forEach(el => el.addEventListener('keydown', e => { if (e.key === 'Enter') startCheck(); })); +