bug fix
This commit is contained in:
parent
e180018c99
commit
857ba953e3
|
|
@ -1,13 +1,14 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_extractors.py
|
FILE: app/core/graph/graph_extractors.py
|
||||||
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
||||||
AUDIT: Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen Support).
|
AUDIT:
|
||||||
|
- Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen).
|
||||||
|
- Callout-Parser erweitert für Multi-Line-Listen und Header-Typen.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
# Fix: Erlaube alle Zeichen außer ']' im Target, statt nur a-z0-9.
|
# Erlaube alle Zeichen außer ']' im Target (fängt Umlaute, Emojis, '&', '#' ab)
|
||||||
# Das fängt Umlaute, Emojis, '&', '#' und Leerzeichen ab.
|
|
||||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]")
|
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]")
|
||||||
|
|
||||||
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
|
|
@ -15,8 +16,10 @@ _REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\
|
||||||
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
|
|
||||||
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
||||||
|
# Erkennt "kind: targets..."
|
||||||
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
||||||
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
# Erkennt reine Typen (z.B. "depends_on" im Header)
|
||||||
|
_SIMPLE_KIND = re.compile(r"^[a-z_]+$", re.IGNORECASE)
|
||||||
|
|
||||||
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
|
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -35,28 +38,90 @@ def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
|
||||||
return pairs, text
|
return pairs, text
|
||||||
|
|
||||||
def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
    """
    Extract typed relations from Obsidian ``[!edge]`` callouts.

    Two notations are supported inside a callout block:

    1. Explicit: a line of the form ``kind: [[Target]]``. Several wikilinks
       per line are allowed. If the value contains no wikilink at all, it is
       split on ``,`` / ``;`` and every non-empty part is used as a plain-text
       target. The explicit kind applies to that line only.
    2. Implicit: a bare kind as the first line of the block (normally the
       header, ``> [!edge] kind``) followed by lines containing
       ``[[Target]]`` links, which inherit that kind.

    Wikilinks on lines without an explicit kind are ignored when the block
    has no default kind, to avoid false positives.

    Args:
        text: Markdown text to scan. Falsy input is returned unchanged.

    Returns:
        A tuple ``(pairs, remaining_text)``. ``pairs`` is a list of
        ``(kind, target)`` tuples in document order with lower-cased kinds
        and stripped, non-empty targets. ``remaining_text`` consists of all
        lines that do not belong to an ``[!edge]`` callout, re-joined with
        newline characters.
    """
    if not text:
        return [], text

    lines = text.splitlines()
    out_pairs: List[Tuple[str, str]] = []
    keep_lines: List[str] = []
    i = 0

    while i < len(lines):
        line = lines[i]
        m = _CALLOUT_START.match(line)
        if not m:
            keep_lines.append(line)
            i += 1
            continue

        # Callout block found: collect its header remainder and body lines.
        block_lines = []

        # Text after "[!edge]" on the header line (e.g. the default kind).
        header_raw = m.group(1).strip()
        if header_raw:
            block_lines.append(header_raw)

        i += 1
        # Consume the quoted continuation lines. A new "[!edge]" header ends
        # the current block so that directly adjacent callouts keep their own
        # default kind instead of inheriting the previous one.
        while (
            i < len(lines)
            and lines[i].lstrip().startswith('>')
            and not _CALLOUT_START.match(lines[i])
        ):
            # Drop the leading '>' and surrounding whitespace.
            content = lines[i].lstrip()[1:].lstrip()
            if content:
                block_lines.append(content)
            i += 1

        # Heuristic: if the very first collected line is a bare kind word,
        # it becomes the default kind for the whole block. A bare word can
        # never look like "kind: targets", so one check is sufficient.
        current_kind = None
        if block_lines and _SIMPLE_KIND.match(block_lines[0]):
            current_kind = block_lines[0].lower()

        for bl in block_lines:
            # 1. Explicit "kind: targets" overrides the default for this line
            #    only; the block default stays untouched for later lines.
            mrel = _REL_LINE.match(bl)
            if mrel:
                line_kind = mrel.group("kind").strip().lower()
                targets = mrel.group("targets")

                found = _WIKILINK_RE.findall(targets)
                # Fallback: comma/semicolon separated plain-text targets.
                candidates = found if found else re.split(r"[,;]", targets)
                for raw in candidates:
                    target = raw.strip()
                    if target:
                        out_pairs.append((line_kind, target))
                continue

            # 2. No "kind: targets" pattern: wikilinks inherit the block's
            #    default kind. Without a default they are ignored on purpose.
            if current_kind:
                for raw in _WIKILINK_RE.findall(bl):
                    target = raw.strip()
                    if target:
                        out_pairs.append((current_kind, target))

    return out_pairs, "\n".join(keep_lines)
|
||||||
|
|
||||||
def extract_wikilinks(text: str) -> List[str]:
    """
    Return the targets of all standard wikilinks found in *text*.

    Handles ``[[Target]]`` as well as ``[[Alias|Target]]``; the value taken
    is whatever ``_WIKILINK_RE`` captures, i.e. the part after an optional
    ``...|`` prefix. Targets are stripped, and targets that are empty after
    stripping are skipped. Falsy input yields an empty list.
    """
    if not text:
        return []

    targets = []
    for captured in _WIKILINK_RE.findall(text):
        cleaned = captured.strip()
        if cleaned:
            targets.append(cleaned)
    return targets
|
||||||
|
|
@ -10,15 +10,19 @@ LAST_ANALYSIS: 2025-12-15
|
||||||
import re
|
import re
|
||||||
from qdrant_client import QdrantClient, models
|
from qdrant_client import QdrantClient, models
|
||||||
from streamlit_agraph import Node, Edge
|
from streamlit_agraph import Node, Edge
|
||||||
from ui_config import GRAPH_COLORS, get_edge_color, SYSTEM_EDGES
|
from ui_config import COLLECTION_PREFIX, GRAPH_COLORS, get_edge_color, SYSTEM_EDGES
|
||||||
|
|
||||||
class GraphExplorerService:
|
class GraphExplorerService:
|
||||||
def __init__(self, url, api_key=None, prefix=None):
    """
    Set up the Qdrant client and derive the collection names.

    A falsy ``prefix`` (``None`` or an empty string) falls back to
    ``COLLECTION_PREFIX`` from ``ui_config``. The notes, chunks and edges
    collection names are built from the effective prefix, and the note
    cache starts out empty.
    """
    self.client = QdrantClient(url=url, api_key=api_key)

    # Resolve the effective prefix once; all collection names derive from it.
    self.prefix = prefix or COLLECTION_PREFIX
    self.notes_col = f"{self.prefix}_notes"
    self.chunks_col = f"{self.prefix}_chunks"
    self.edges_col = f"{self.prefix}_edges"

    self._note_cache = {}
|
||||||
|
|
||||||
def get_note_with_full_content(self, note_id):
|
def get_note_with_full_content(self, note_id):
|
||||||
|
|
@ -421,7 +425,7 @@ class GraphExplorerService:
|
||||||
|
|
||||||
def _add_node_to_dict(self, node_dict, note_payload, level=1):
|
def _add_node_to_dict(self, node_dict, note_payload, level=1):
|
||||||
nid = note_payload.get("note_id")
|
nid = note_payload.get("note_id")
|
||||||
if nid in node_dict: return
|
if not nid or nid in node_dict: return
|
||||||
|
|
||||||
ntype = note_payload.get("type", "default")
|
ntype = note_payload.get("type", "default")
|
||||||
color = GRAPH_COLORS.get(ntype, GRAPH_COLORS["default"])
|
color = GRAPH_COLORS.get(ntype, GRAPH_COLORS["default"])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user