bug fix
This commit is contained in:
parent
feeb7c2d92
commit
303efefcb7
|
|
@ -1,11 +1,15 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_extractors.py
|
FILE: app/core/graph/graph_extractors.py
|
||||||
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
||||||
|
AUDIT: Regex für Wikilinks liberalisiert (Unterstützung für Umlaute und Sonderzeichen).
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
|
# Fix: Erlaube alle Zeichen außer ']' im Target, statt nur [a-zA-Z0-9_\-#:. ].
|
||||||
|
# Damit werden zusätzlich Umlaute, Emojis und Sonderzeichen wie '&' erkannt.
|
||||||
|
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]")
|
||||||
|
|
||||||
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
|
|
@ -14,12 +18,16 @@ _CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
||||||
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
||||||
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||||
|
|
||||||
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
|
||||||
"""Extrahiert [[rel:KIND|Target]]."""
|
"""
|
||||||
|
Findet Inline-Relationen wie [[rel:depends_on|Target]] oder [[rel:depends_on Target]].
|
||||||
|
Gibt (Liste[(kind, target)], bereinigter_text) zurück.
|
||||||
|
"""
|
||||||
|
if not text: return [], ""
|
||||||
pairs = []
|
pairs = []
|
||||||
def _collect(m):
|
def _collect(m):
|
||||||
k, t = (m.group("kind") or "").strip().lower(), (m.group("target") or "").strip()
|
k, t = m.group("kind").strip().lower(), m.group("target").strip()
|
||||||
if k and t: pairs.append((k, t))
|
pairs.append((k, t))
|
||||||
return ""
|
return ""
|
||||||
text = _REL_PIPE.sub(_collect, text)
|
text = _REL_PIPE.sub(_collect, text)
|
||||||
text = _REL_SPACE.sub(_collect, text)
|
text = _REL_SPACE.sub(_collect, text)
|
||||||
|
|
@ -45,11 +53,10 @@ def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
||||||
found = _WIKILINKS_IN_LINE.findall(targets)
|
found = _WIKILINKS_IN_LINE.findall(targets)
|
||||||
if found:
|
if found:
|
||||||
for t in found: out_pairs.append((kind, t.strip()))
|
for t in found: out_pairs.append((kind, t.strip()))
|
||||||
else:
|
|
||||||
for raw in re.split(r"[,;]", targets):
|
|
||||||
if raw.strip(): out_pairs.append((kind, raw.strip()))
|
|
||||||
return out_pairs, "\n".join(keep_lines)
|
return out_pairs, "\n".join(keep_lines)
|
||||||
|
|
||||||
def extract_wikilinks(text: str) -> List[str]:
|
def extract_wikilinks(text: str) -> List[str]:
|
||||||
"""Extrahiert Standard-Wikilinks."""
|
"""Findet Standard-Wikilinks [[Target]] oder [[Alias|Target]]."""
|
||||||
return [m.group(1).strip() for m in _WIKILINK_RE.finditer(text or "")]
|
if not text: return []
|
||||||
|
# findall() liefert direkt Gruppe 1, also das Target (dank des korrigierten Regex)
|
||||||
|
return [m.strip() for m in _WIKILINK_RE.findall(text) if m.strip()]
|
||||||
Loading…
Reference in New Issue
Block a user