Long markdown files (>~8KB) silently failed to embed because nomic-embed-text on iguana has a 2048-token context. embed sync logged errors=1 every cycle with no useful body until #37 added per-item logging — three files exceed the ceiling: finbert source (8 KB), koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000 (≈1000 nomic tokens, well under the 2048 ceiling with headroom for unicode/code blocks).
- "Already embedded?" check now reduces known paths to parent set via ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears, every chunk row (and any pre-existing bare-path row, for backward compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to parent via ParentPath before scope check, RRF accumulation, and hydration. A file with three chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings keep working — ParentPath returns them unchanged, knownParents handles them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and search dedup is a no-op for them. No migration required to ship.

Tests:
- chunk_test.go covers short / heading split / oversized section / content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file, skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file. Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
73 lines
2.7 KiB
Go
73 lines
2.7 KiB
Go
package vectorstore_test
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestChunkMarkdown_ShortFileFitsInOne(t *testing.T) {
|
|
out := vectorstore.ChunkMarkdown("Just a short paragraph.\n", 4000)
|
|
require.Len(t, out, 1)
|
|
assert.Equal(t, "Just a short paragraph.\n", out[0])
|
|
}
|
|
|
|
func TestChunkMarkdown_SplitsAtHeadings(t *testing.T) {
|
|
src := "# Top\n\nintro\n\n## A\n\nbody a\n\n## B\n\nbody b\n"
|
|
out := vectorstore.ChunkMarkdown(src, 50) // tiny limit forces per-section split
|
|
|
|
assert.GreaterOrEqual(t, len(out), 2, "should split at H2 boundaries")
|
|
// Each chunk should start with a heading (top-level intro chunk OK without one)
|
|
for i, c := range out {
|
|
if i == 0 {
|
|
continue
|
|
}
|
|
assert.True(t, strings.HasPrefix(strings.TrimSpace(c), "#"),
|
|
"non-first chunk %d should start with heading: %q", i, c)
|
|
}
|
|
}
|
|
|
|
func TestChunkMarkdown_FurtherSplitsOversizedSection(t *testing.T) {
|
|
// One H2 section with 4 paragraphs of ~80 chars each, limit 100.
|
|
src := "## big\n\n" +
|
|
strings.Repeat("paragraph one is moderately long.\n\n", 1) +
|
|
strings.Repeat("paragraph two also moderately long.\n\n", 1) +
|
|
strings.Repeat("paragraph three is moderately long.\n\n", 1) +
|
|
strings.Repeat("paragraph four is moderately long.\n\n", 1)
|
|
out := vectorstore.ChunkMarkdown(src, 100)
|
|
|
|
assert.Greater(t, len(out), 1, "oversized section should sub-split at paragraph boundaries")
|
|
for i, c := range out {
|
|
assert.LessOrEqual(t, len(c), 200,
|
|
"chunk %d exceeds 2x maxBytes: %d", i, len(c))
|
|
}
|
|
}
|
|
|
|
func TestChunkMarkdown_PreservesContent(t *testing.T) {
|
|
src := "# H1\n\nfirst section body.\n\n## H2a\n\nsecond section body.\n\n## H2b\n\nthird section body.\n"
|
|
out := vectorstore.ChunkMarkdown(src, 50)
|
|
joined := strings.Join(out, "")
|
|
// All non-whitespace tokens from src must appear in the joined output
|
|
for _, token := range []string{"H1", "first", "H2a", "second", "H2b", "third"} {
|
|
assert.Contains(t, joined, token, "token %q missing after chunking", token)
|
|
}
|
|
}
|
|
|
|
func TestChunkMarkdown_NumberedSuffix(t *testing.T) {
|
|
out := vectorstore.NumberChunks("knowledge/foo.md", []string{"a", "b", "c"})
|
|
require.Len(t, out, 3)
|
|
assert.Equal(t, "knowledge/foo.md#0001", out[0].Path)
|
|
assert.Equal(t, "knowledge/foo.md#0002", out[1].Path)
|
|
assert.Equal(t, "knowledge/foo.md#0003", out[2].Path)
|
|
assert.Equal(t, "a", out[0].Content)
|
|
}
|
|
|
|
func TestParentPath_StripsChunkSuffix(t *testing.T) {
|
|
assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md#0001"))
|
|
assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md"))
|
|
assert.Equal(t, "wiki/a/b.md", vectorstore.ParentPath("wiki/a/b.md#9999"))
|
|
}
|