This commit is contained in:
s4luorth
2026-02-07 14:03:54 +01:00
parent 8b844b8291
commit 26b5465f53
3 changed files with 472 additions and 1027 deletions

View File

@@ -35,6 +35,20 @@ IGNORE_SELECTORS = [
# Maximum chunk length; chosen to stay under the 10k limit.
CHUNK_SIZE = 9000

# LanguageTool rule IDs whose matches are dropped without being reported.
IGNORED_RULES = {
    # Rule IDs with German names ("Leerzeichen vor ..." = "space before ...")
    "LEERZEICHEN_VOR_SATZZEICHEN",
    "LEERZEICHEN_VOR_DOPPELPUNKT",
    "LEERZEICHEN_VOR_SEMIKOLON",
    "LEERZEICHEN_VOR_AUSRUFEZEICHEN",
    "LEERZEICHEN_VOR_FRAGEZEICHEN",
    # Rule IDs with English names
    "WHITESPACE_RULE",
    "WHITESPACE_BEFORE_PUNCTUATION",
}

# Fallback pattern for whitespace-before-punctuation findings that carry a
# rule ID not listed above: one or more whitespace characters followed by
# exactly one of , . : ; ! ? " or a curly double quote (U+201C-U+201F).
SPACE_BEFORE_PUNCT_RE = re.compile(
    r"\s+[,.:;!?\"\u201C\u201D\u201E\u201F]"
)
def normalize_url(url: str) -> str:
"""Remove fragment and trailing slash for dedup."""
@@ -289,6 +303,15 @@ def check_text_with_languagetool(
if resp.status_code == 200:
result = resp.json()
for match in result.get("matches", []):
rule_id = match.get("rule", {}).get("id", "")
if rule_id in IGNORED_RULES:
continue
# Extra filter: skip any match that is just whitespace before punctuation
m_off = match.get("offset", 0)
m_len = match.get("length", 0)
matched_text = chunk_text[m_off:m_off + m_len]
if SPACE_BEFORE_PUNCT_RE.fullmatch(matched_text):
continue
match["offset"] += offset
all_matches.append(match)
elif resp.status_code in (401, 403):

File diff suppressed because it is too large Load Diff