feat(vectorstore): re-embed on file mtime > store updated_at (#23)

Removes the TODO in Sync that left files static after their first embed. Edits to brain/wiki/ and brain/knowledge/ now surface in subsequent syncs without manual /backfill-embeddings calls. Approach - Store interface: KnownPaths → KnownPathsWithTime returning path → updated_at. Callers compare against file mtime to detect edits. - PGStore: SELECT path, updated_at FROM brain_embeddings. - Sync groups known chunks by parent path and tracks the EARLIEST updated_at per parent. A file is stale when its mtime is after that oldest chunk's timestamp — any chunk older than the file means at least one chunk hasn't been refreshed since the last edit. - Stale-path rewrite: delete every old chunk for the parent (handles "file shrunk → fewer chunks → orphan rows at higher #NNNN" cleanly), then re-chunk + re-embed + re-upsert. Tests - New: TestSync_ReembedsFileWhenMtimeNewer — file mtime forced into the future vs store updated_at; Sync deletes old chunk + upserts fresh one. - New: TestSync_SkipsFileWhenMtimeOlder — file mtime backdated; Sync is a no-op (no upserts, no deletes). - Updated: stubStore.known is now map[string]time.Time. A zero value resolves to a far-future sentinel so existing "skip if already known" tests keep passing without per-test setup. - pg_test renamed KnownPaths integration → KnownPathsWithTime; asserts updated_at is non-zero and within 5s of insert wall-clock. Backward compat - brain_embeddings rows pre-dating this change carry valid updated_at values (column was always populated via `DEFAULT now()` + ON CONFLICT `updated_at = now()`). No migration needed. Live pod will start re-embedding any file whose source has been edited since its chunks were originally written. Closes gitea/mathias/hyperguild#23.
feat(project_create): mirror_to_github opt-in, default false (infra#34 ADR)
2026-05-20 09:50:45 +02:00 · 2026-05-20 08:35:02 +02:00 · 2026-05-19 21:57:09 +02:00
11 changed files with 598 additions and 88 deletions
--- a/ingestion/internal/search/search.go
+++ b/ingestion/internal/search/search.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
 )
 // VectorSearcher returns the top-limit nearest paths by cosine
@@ -186,17 +187,21 @@ func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 [
 		byPath[r.Path] = r
 	}
 	for rank, h := range hits {
-		if opts.Wing != "" && !pathInScope(h.Path, opts.Wing, opts.Hall) {
+		// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
 		// back to the parent so multiple chunk hits from the same file
 		// score against a single result row.
 		parent := vectorstore.ParentPath(h.Path)
 		if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
 			continue
 		}
-		rrf[h.Path] += 1.0 / (rrfK + float64(rank+1))
+		rrf[parent] += 1.0 / (rrfK + float64(rank+1))
-		if _, seen := byPath[h.Path]; !seen {
+		if _, seen := byPath[parent]; !seen {
-			r, err := hydrate(brainDir, h.Path)
+			r, err := hydrate(brainDir, parent)
 			if err != nil {
-				slog.Warn("search: hydrate failed for vector hit", "path", h.Path, "err", err)
+				slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
 				continue
 			}
-			byPath[h.Path] = r
+			byPath[parent] = r
 		}
 	}
--- a/ingestion/internal/search/search_test.go
+++ b/ingestion/internal/search/search_test.go
@@ -55,6 +55,36 @@ func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
 	assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
 }
 // TestSearch_HybridDedupesChunkPathsToParent checks that several vector
 // hits carrying chunk-suffixed paths of one file come back from Query as
 // a single result keyed by the parent path.
 func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
 	brainDir := t.TempDir()
 	notePath := filepath.Join(brainDir, "knowledge", "long.md")
 	require.NoError(t, os.MkdirAll(filepath.Dir(notePath), 0o755))
 	// The body carries the BM25 keyword "alpaca": hybridMerge only runs
 	// when BM25 yields at least one candidate.
 	note := []byte("---\ntitle: Long\n---\nalpaca content.\n")
 	require.NoError(t, os.WriteFile(notePath, note, 0o644))
 	// Three chunk-level hits, all belonging to the same parent file. The
 	// merged output must contain ONE row for that parent, not three rows
 	// with chunk-suffixed paths.
 	chunkHits := []search.VectorHit{
 		{Path: "knowledge/long.md#0001", Distance: 0.05},
 		{Path: "knowledge/long.md#0002", Distance: 0.07},
 		{Path: "knowledge/long.md#0003", Distance: 0.09},
 	}
 	opts := search.QueryOptions{
 		Query:    "alpaca",
 		Limit:    5,
 		Vector:   stubVector{hits: chunkHits},
 		Embedder: stubEmbedder{vec: []float32{0.1}},
 	}
 	results, err := search.Query(brainDir, opts)
 	require.NoError(t, err)
 	require.Len(t, results, 1, "three chunk hits for one parent must merge to one result")
 	first := results[0]
 	assert.Equal(t, "knowledge/long.md", first.Path)
 	assert.Equal(t, "Long", first.Title)
 }
 func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
 	dir := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
--- a/ingestion/internal/vectorstore/chunk.go
+++ b/ingestion/internal/vectorstore/chunk.go
@@ -0,0 +1,137 @@
 package vectorstore
 import (
 	"fmt"
 	"strings"
 )
 // NumberedChunk pairs a chunk's body with the storage path it will use
 // in brain_embeddings. Path format: "<parent>#NNNN" where NNNN is the
 // 1-based chunk index zero-padded to 4 digits.
 type NumberedChunk struct {
 	// Path is the chunk's storage key, "<parent>#NNNN".
 	Path    string
 	// Content is the chunk's body text, unmodified.
 	Content string
 }
 // ParentPath returns the file path with a trailing "#NNNN" chunk suffix
 // removed. Only a final "#" followed exclusively by decimal digits counts
 // as a chunk suffix, so a "#" that belongs to a directory or file name
 // (for example "wiki/c#/notes.md") is preserved. Inputs without such a
 // suffix are returned unchanged. Used by search to dedupe chunk-level
 // hits back to a single document per result.
 func ParentPath(p string) string {
 	i := strings.LastIndexByte(p, '#')
 	if i < 0 || i == len(p)-1 {
 		// No "#" at all, or nothing after it: not a chunk path.
 		return p
 	}
 	for _, c := range []byte(p[i+1:]) {
 		if c < '0' || c > '9' {
 			// The last "#" is part of the name, not a chunk index.
 			return p
 		}
 	}
 	return p[:i]
 }
 // NumberChunks turns raw chunk bodies into NumberedChunk values keyed by
 // "<parent>#NNNN". Numbering starts at 0001 and only advances for chunks
 // that are kept; chunks that are empty or whitespace-only are skipped
 // and do not consume an index.
 func NumberChunks(parent string, chunks []string) []NumberedChunk {
 	numbered := make([]NumberedChunk, 0, len(chunks))
 	for i := range chunks {
 		body := chunks[i]
 		if len(strings.TrimSpace(body)) == 0 {
 			continue
 		}
 		// The next index is always one past the number of chunks kept so far.
 		seq := len(numbered) + 1
 		numbered = append(numbered, NumberedChunk{
 			Content: body,
 			Path:    parent + "#" + fmt.Sprintf("%04d", seq),
 		})
 	}
 	return numbered
 }
 // ChunkMarkdown breaks a markdown document into pieces sized for
 // embedding. It works in two passes:
 //  1. The document is cut at H1/H2 headings (lines starting with "# " or
 //     "## "); text before the first heading forms its own piece.
 //  2. A section longer than maxBytes is cut again at blank-line
 //     paragraph boundaries, with paragraphs packed greedily under the
 //     byte budget.
 //
 // A maxBytes of zero or less selects the default budget of 4000 bytes,
 // chosen to sit well inside nomic-embed-text's 2048-token context at
 // roughly 4 characters per token for English markdown.
 func ChunkMarkdown(content string, maxBytes int) []string {
 	budget := maxBytes
 	if budget <= 0 {
 		budget = 4000
 	}
 	sections := splitAtHeadings(content)
 	chunks := make([]string, 0, len(sections))
 	for i := range sections {
 		if sec := sections[i]; len(sec) > budget {
 			// Over budget: fall back to paragraph-level packing.
 			chunks = append(chunks, splitAtParagraphs(sec, budget)...)
 		} else {
 			chunks = append(chunks, sec)
 		}
 	}
 	return chunks
 }
 // splitAtHeadings cuts content into sections that each start with an
 // "# " or "## " line (the intro before any heading is the leading
 // section). Leading spaces before the "#" are tolerated.
 //
 // Lines inside a fenced code block (opened by a run of three or more
 // backticks or tildes) never start a new section, so "# comment" lines
 // in shell or Python snippets do not tear a code block apart. A fence
 // that is never closed extends to the end of the content.
 //
 // Every returned section has its trailing newlines collapsed to exactly
 // one, so a single-paragraph file round-trips to its original content.
 // Sections consisting only of whitespace are dropped.
 func splitAtHeadings(content string) []string {
 	var (
 		sections  []string
 		cur       strings.Builder
 		fenceChar byte // '`' or '~' while inside a fenced block, 0 otherwise
 		fenceLen  int  // length of the run that opened the current fence
 	)
 	// flush emits the accumulated section (if it has any visible text)
 	// and resets the accumulator.
 	flush := func() {
 		s := strings.TrimRight(cur.String(), "\n")
 		cur.Reset()
 		if strings.TrimSpace(s) == "" {
 			return
 		}
 		sections = append(sections, s+"\n")
 	}
 	// fenceRun reports the fence character and run length when s starts
 	// with backticks or tildes; n is 0 otherwise.
 	fenceRun := func(s string) (c byte, n int) {
 		if s == "" || (s[0] != '`' && s[0] != '~') {
 			return 0, 0
 		}
 		n = 1
 		for n < len(s) && s[n] == s[0] {
 			n++
 		}
 		return s[0], n
 	}
 	for _, ln := range strings.Split(content, "\n") {
 		trimmed := strings.TrimLeft(ln, " ")
 		c, n := fenceRun(trimmed)
 		switch {
 		case n >= 3 && fenceChar == 0:
 			// Opening fence. A backtick fence may not carry backticks in
 			// its info string; such a line is inline code, not a fence.
 			if c != '`' || !strings.Contains(trimmed[n:], "`") {
 				fenceChar, fenceLen = c, n
 			}
 		case n >= 3 && c == fenceChar && n >= fenceLen && strings.TrimSpace(trimmed[n:]) == "":
 			// Closing fence: same character, at least as long, nothing after.
 			fenceChar, fenceLen = 0, 0
 		case fenceChar == 0 && cur.Len() > 0 &&
 			(strings.HasPrefix(trimmed, "# ") || strings.HasPrefix(trimmed, "## ")):
 			flush()
 		}
 		cur.WriteString(ln)
 		cur.WriteByte('\n')
 	}
 	flush()
 	return sections
 }
 // splitAtParagraphs breaks section into blank-line separated paragraphs
 // and greedily groups consecutive paragraphs into sub-chunks no longer
 // than maxBytes, rejoining them with "\n\n". A lone paragraph that is
 // itself longer than maxBytes is kept whole as an over-budget chunk:
 // spending a little extra beats cutting prose mid-sentence.
 func splitAtParagraphs(section string, maxBytes int) []string {
 	var (
 		chunks []string
 		batch  []string // paragraphs collected for the chunk being built
 		size   int      // byte length batch will have once joined
 	)
 	for _, para := range strings.Split(section, "\n\n") {
 		if len(para) == 0 {
 			continue
 		}
 		// Adding para to a non-empty batch costs its length plus the
 		// two-byte "\n\n" separator.
 		if len(batch) > 0 && size+2+len(para) > maxBytes {
 			chunks = append(chunks, strings.Join(batch, "\n\n"))
 			batch, size = batch[:0], 0
 		}
 		if len(batch) > 0 {
 			size += 2
 		}
 		batch = append(batch, para)
 		size += len(para)
 	}
 	if len(batch) > 0 {
 		chunks = append(chunks, strings.Join(batch, "\n\n"))
 	}
 	return chunks
 }
--- a/ingestion/internal/vectorstore/chunk_test.go
+++ b/ingestion/internal/vectorstore/chunk_test.go
@@ -0,0 +1,72 @@
 package vectorstore_test
 import (
 	"strings"
 	"testing"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 // A document far below the byte budget must come back as one chunk that
 // is byte-for-byte equal to the input.
 func TestChunkMarkdown_ShortFileFitsInOne(t *testing.T) {
 	const doc = "Just a short paragraph.\n"
 	chunks := vectorstore.ChunkMarkdown(doc, 4000)
 	require.Len(t, chunks, 1)
 	assert.Equal(t, doc, chunks[0])
 }
 // A document with H1/H2 headings must be cut into several chunks, and
 // every chunk after the first must open with a heading.
 func TestChunkMarkdown_SplitsAtHeadings(t *testing.T) {
 	const doc = "# Top\n\nintro\n\n## A\n\nbody a\n\n## B\n\nbody b\n"
 	chunks := vectorstore.ChunkMarkdown(doc, 50) // small budget, one chunk per section
 	assert.GreaterOrEqual(t, len(chunks), 2, "should split at H2 boundaries")
 	// Chunk 0 is exempt: it may be an intro without a heading.
 	for i := 1; i < len(chunks); i++ {
 		c := chunks[i]
 		startsWithHeading := strings.HasPrefix(strings.TrimSpace(c), "#")
 		assert.True(t, startsWithHeading,
 			"non-first chunk %d should start with heading: %q", i, c)
 	}
 }
 func TestChunkMarkdown_FurtherSplitsOversizedSection(t *testing.T) {
 	// One H2 section with 4 paragraphs of ~80 chars each, limit 100.
 	src := "## big\n\n" +
 		strings.Repeat("paragraph one is moderately long.\n\n", 1) +
 		strings.Repeat("paragraph two also moderately long.\n\n", 1) +
 		strings.Repeat("paragraph three is moderately long.\n\n", 1) +
 		strings.Repeat("paragraph four is moderately long.\n\n", 1)
 	out := vectorstore.ChunkMarkdown(src, 100)
 	assert.Greater(t, len(out), 1, "oversized section should sub-split at paragraph boundaries")
 	for i, c := range out {
 		assert.LessOrEqual(t, len(c), 200,
 			"chunk %d exceeds 2x maxBytes: %d", i, len(c))
 	}
 }
 // Chunking must not lose text: marker words from every section have to
 // be present somewhere in the concatenated chunks.
 func TestChunkMarkdown_PreservesContent(t *testing.T) {
 	const doc = "# H1\n\nfirst section body.\n\n## H2a\n\nsecond section body.\n\n## H2b\n\nthird section body.\n"
 	merged := strings.Join(vectorstore.ChunkMarkdown(doc, 50), "")
 	markers := []string{"H1", "first", "H2a", "second", "H2b", "third"}
 	for _, tok := range markers {
 		assert.Contains(t, merged, tok, "token %q missing after chunking", tok)
 	}
 }
 // NumberChunks must assign consecutive, zero-padded "#NNNN" suffixes
 // starting at 0001 and carry each body through unchanged.
 func TestChunkMarkdown_NumberedSuffix(t *testing.T) {
 	numbered := vectorstore.NumberChunks("knowledge/foo.md", []string{"a", "b", "c"})
 	require.Len(t, numbered, 3)
 	wantPaths := []string{
 		"knowledge/foo.md#0001",
 		"knowledge/foo.md#0002",
 		"knowledge/foo.md#0003",
 	}
 	for i, want := range wantPaths {
 		assert.Equal(t, want, numbered[i].Path)
 	}
 	assert.Equal(t, "a", numbered[0].Content)
 }
 // ParentPath must drop a "#NNNN" chunk suffix and leave suffix-free
 // paths untouched.
 func TestParentPath_StripsChunkSuffix(t *testing.T) {
 	cases := []struct{ in, want string }{
 		{in: "knowledge/foo.md#0001", want: "knowledge/foo.md"},
 		{in: "knowledge/foo.md", want: "knowledge/foo.md"},
 		{in: "wiki/a/b.md#9999", want: "wiki/a/b.md"},
 	}
 	for _, tc := range cases {
 		assert.Equal(t, tc.want, vectorstore.ParentPath(tc.in))
 	}
 }
--- a/ingestion/internal/vectorstore/pg.go
+++ b/ingestion/internal/vectorstore/pg.go
@@ -8,6 +8,7 @@ import (
 	"errors"
 	"fmt"
 	"strings"
 	"time"
 	"github.com/jackc/pgx/v5"
 	"github.com/jackc/pgx/v5/pgxpool"
@@ -120,21 +121,26 @@ func (s *PGStore) Search(ctx context.Context, query []float32, limit int) ([]Hit
 	return hits, nil
 }
-// KnownPaths returns the path set already present in the store. Used by
+// KnownPathsWithTime returns every embedded chunk path paired with the
-// the watcher to diff against the wiki/ tree and decide what to upsert.
+// row's updated_at. Sync uses the timestamps to decide whether a file
-func (s *PGStore) KnownPaths(ctx context.Context) (map[string]struct{}, error) {
+// has been edited since its chunks were last embedded — when the file's
-	rows, err := s.pool.Query(ctx, `SELECT path FROM brain_embeddings`)
+// mtime exceeds the oldest chunk's updated_at, the file is re-embedded.
 func (s *PGStore) KnownPathsWithTime(ctx context.Context) (map[string]time.Time, error) {
 	rows, err := s.pool.Query(ctx, `SELECT path, updated_at FROM brain_embeddings`)
 	if err != nil {
 		return nil, fmt.Errorf("query paths: %w", err)
 	}
 	defer rows.Close()
-	out := make(map[string]struct{})
+	out := make(map[string]time.Time)
 	for rows.Next() {
-		var p string
+		var (
-		if err := rows.Scan(&p); err != nil {
+			p string
 			t time.Time
 		)
 		if err := rows.Scan(&p, &t); err != nil {
 			return nil, err
 		}
-		out[p] = struct{}{}
+		out[p] = t
 	}
 	return out, rows.Err()
 }
--- a/ingestion/internal/vectorstore/pg_test.go
+++ b/ingestion/internal/vectorstore/pg_test.go
@@ -36,7 +36,7 @@ func freshStore(t *testing.T) (*vectorstore.PGStore, context.Context) {
 	t.Cleanup(s.Close)
 	require.NoError(t, s.Init(ctx))
 	// Clean slate per test.
-	_, _ = s.KnownPaths(ctx)
+	_, _ = s.KnownPathsWithTime(ctx)
 	require.NoError(t, s.Delete(ctx, "%test-fixture%"))
 	return s, ctx
 }
@@ -67,15 +67,18 @@ func TestIntegration_UpsertAndSearch(t *testing.T) {
 	})
 }
-func TestIntegration_KnownPaths(t *testing.T) {
+func TestIntegration_KnownPathsWithTime(t *testing.T) {
 	s, ctx := freshStore(t)
 	before := time.Now()
 	require.NoError(t, s.Upsert(ctx, "wiki/k.md", vec(768, 0.5)))
 	t.Cleanup(func() { _ = s.Delete(ctx, "wiki/k.md") })
-	paths, err := s.KnownPaths(ctx)
+	paths, err := s.KnownPathsWithTime(ctx)
 	require.NoError(t, err)
-	_, ok := paths["wiki/k.md"]
+	at, ok := paths["wiki/k.md"]
-	assert.True(t, ok)
+	require.True(t, ok)
 	assert.False(t, at.IsZero(), "updated_at must not be zero")
 	assert.WithinDuration(t, before, at, 5*time.Second, "updated_at must be recent")
 }
 func TestUpsert_RejectsWrongDimension(t *testing.T) {
--- a/ingestion/internal/vectorstore/sync.go
+++ b/ingestion/internal/vectorstore/sync.go
@@ -18,7 +18,11 @@ type Embedder interface {
 // Store is the subset of PGStore that Sync needs. Lets tests stub it.
 type Store interface {
-	KnownPaths(ctx context.Context) (map[string]struct{}, error)
+	// KnownPathsWithTime returns every embedded chunk path paired with the
 	// row's updated_at. Sync uses the timestamp to detect edits — a file
 	// whose mtime is newer than ANY of its chunks' updated_at is re-embedded
 	// from scratch (old chunks deleted, fresh chunks upserted).
 	KnownPathsWithTime(ctx context.Context) (map[string]time.Time, error)
 	Upsert(ctx context.Context, path string, embedding []float32) error
 	Delete(ctx context.Context, path string) error
 }
@@ -37,6 +41,13 @@ type SyncResult struct {
 // source pages; knowledge/ holds curated hand-written entries.
 var scanDirs = []string{"wiki", "knowledge"}
 // maxChunkBytes is the per-chunk byte budget passed to ChunkMarkdown.
 // Sized to fit comfortably under nomic-embed-text's 2048-token default
 // context (~4 chars/token for English markdown → ~8 KB ceiling; we sit
 // at 4 KB to leave headroom for unicode, code blocks, and tokenizer
 // variance).
 const maxChunkBytes = 4000
 // Sync brings the embedding store in line with brain/{wiki,knowledge}/
 // on disk:
 //   - new files (in the tree, not in the store) get embedded + upserted
@@ -51,11 +62,33 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 		return res, nil
 	}
-	known, err := store.KnownPaths(ctx)
+	known, err := store.KnownPathsWithTime(ctx)
 	if err != nil {
 		return res, fmt.Errorf("known paths: %w", err)
 	}
-	seen := make(map[string]struct{})
+	// Group known chunks by parent path and remember the EARLIEST
 	// updated_at per parent. A file is considered stale if its mtime is
 	// after the oldest of its chunk rows — i.e. at least one chunk hasn't
 	// been refreshed since the last edit. Also keep the full chunk-path
 	// list per parent so we can delete every old chunk before re-embedding
 	// (handles "file shrunk → fewer chunks → orphan rows" cleanly).
 	type parentState struct {
 		minUpdatedAt time.Time
 		chunkPaths   []string
 	}
 	parents := make(map[string]*parentState, len(known))
 	for p, t := range known {
 		parent := ParentPath(p)
 		ps, ok := parents[parent]
 		if !ok {
 			ps = &parentState{minUpdatedAt: t}
 			parents[parent] = ps
 		} else if t.Before(ps.minUpdatedAt) {
 			ps.minUpdatedAt = t
 		}
 		ps.chunkPaths = append(ps.chunkPaths, p)
 	}
 	seenParents := make(map[string]struct{})
 	for _, sub := range scanDirs {
 		root := filepath.Join(brainDir, sub)
@@ -75,12 +108,28 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 				return err
 			}
 			relSlash := filepath.ToSlash(rel)
-			seen[relSlash] = struct{}{}
+			seenParents[relSlash] = struct{}{}
-			if _, ok := known[relSlash]; ok {
+			if ps, ok := parents[relSlash]; ok {
-				// Already embedded — TODO: compare mtime once Store exposes
+				// File already has chunks in the store. Re-embed only when
-				// updated_at so we re-embed on edit. For now, skip.
+				// the file has been edited since the oldest chunk was
-				return nil
+				// written. Tolerate clock skew with a sub-second grace.
 				info, statErr := d.Info()
 				if statErr != nil {
 					res.Errors = append(res.Errors, fmt.Errorf("stat %s: %w", relSlash, statErr))
 					return nil
 				}
 				if !info.ModTime().After(ps.minUpdatedAt) {
 					return nil
 				}
 				// Stale: delete old chunks before re-embedding so a shrunk
 				// file doesn't leave orphan rows at higher #NNNN indexes.
 				for _, oldPath := range ps.chunkPaths {
 					if delErr := store.Delete(ctx, oldPath); delErr != nil {
 						res.Errors = append(res.Errors, fmt.Errorf("delete %s for re-embed: %w", oldPath, delErr))
 						return nil
 					}
 				}
 			}
 			content, readErr := os.ReadFile(path)
@@ -88,16 +137,19 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 				res.Errors = append(res.Errors, fmt.Errorf("read %s: %w", relSlash, readErr))
 				return nil
 			}
-			vec, embErr := embedder.Embed(ctx, string(content))
+			chunks := NumberChunks(relSlash, ChunkMarkdown(string(content), maxChunkBytes))
-			if embErr != nil {
+			for _, ch := range chunks {
-				res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", relSlash, embErr))
+				vec, embErr := embedder.Embed(ctx, ch.Content)
-				return nil
+				if embErr != nil {
 					res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", ch.Path, embErr))
 					continue
 				}
 				if upErr := store.Upsert(ctx, ch.Path, vec); upErr != nil {
 					res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", ch.Path, upErr))
 					continue
 				}
 				res.Added++
 			}
 			if upErr := store.Upsert(ctx, relSlash, vec); upErr != nil {
 				res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", relSlash, upErr))
 				return nil
 			}
 			res.Added++
 			return nil
 		})
 		if err != nil {
@@ -105,9 +157,9 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 		}
 	}
-	// Drop rows whose file is gone.
+	// Drop chunk rows whose parent file is gone.
 	for path := range known {
-		if _, ok := seen[path]; ok {
+		if _, ok := seenParents[ParentPath(path)]; ok {
 			continue
 		}
 		if err := store.Delete(ctx, path); err != nil {
--- a/ingestion/internal/vectorstore/sync_test.go
+++ b/ingestion/internal/vectorstore/sync_test.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
 	"github.com/stretchr/testify/assert"
@@ -13,16 +15,27 @@ import (
 )
 type stubStore struct {
-	known    map[string]struct{}
+	// known maps chunk-path → updated_at. Tests that don't care about
 	// re-embed-on-mtime use a far-future time so the Sync skip path
 	// always wins. Tests that do exercise the mtime path set the
 	// updated_at explicitly.
 	known    map[string]time.Time
 	upserts  map[string][]float32
 	deletes  []string
 	failNext error
 }
-func (s *stubStore) KnownPaths(_ context.Context) (map[string]struct{}, error) {
+// farFuture is "newer than any file mtime", used as the default
-	out := make(map[string]struct{}, len(s.known))
+// updated_at in stubs that don't care about re-embed behavior.
-	for k := range s.known {
+var farFuture = time.Now().Add(24 * time.Hour)
-		out[k] = struct{}{}
+
 // KnownPathsWithTime hands back a copy of s.known in which every zero
 // timestamp is replaced by farFuture. It never returns an error.
 func (s *stubStore) KnownPathsWithTime(_ context.Context) (map[string]time.Time, error) {
 	snapshot := make(map[string]time.Time, len(s.known))
 	for path, stamp := range s.known {
 		snapshot[path] = stamp
 		if stamp.IsZero() {
 			snapshot[path] = farFuture
 		}
 	}
 	return snapshot, nil
 }
@@ -66,21 +79,21 @@ func TestSync_AddsNewFiles(t *testing.T) {
 	writeNote(t, dir, "wiki/jepa-fx/facts/x.md", "body of x")
 	writeNote(t, dir, "wiki/jepa-fx/facts/y.md", "body of y")
-	store := &stubStore{known: map[string]struct{}{}}
+	store := &stubStore{known: map[string]time.Time{}}
 	emb := stubEmbedder{vec: make([]float32, 768)}
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
 	assert.Equal(t, 2, res.Added)
 	assert.Empty(t, res.Deleted)
-	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md")
+	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md#0001")
-	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md")
+	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md#0001")
 }
 func TestSync_SkipsAlreadyKnown(t *testing.T) {
 	dir := t.TempDir()
 	writeNote(t, dir, "wiki/a/facts/x.md", "x")
-	store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md": {}}}
+	store := &stubStore{known: map[string]time.Time{"wiki/a/facts/x.md#0001": {}}}
 	emb := stubEmbedder{vec: make([]float32, 768)}
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
@@ -92,7 +105,7 @@ func TestSync_DeletesDisappearedFiles(t *testing.T) {
 	dir := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
 	// store has a path that doesn't exist on disk anymore
-	store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md": {}}}
+	store := &stubStore{known: map[string]time.Time{"wiki/old/facts/ghost.md#0001": {}}}
 	res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
 	require.NoError(t, err)
 	assert.Equal(t, 1, res.Deleted)
@@ -110,11 +123,11 @@ func TestSync_SkipsIndexFiles(t *testing.T) {
 	writeNote(t, dir, "wiki/a/_index.md", "moc")
 	writeNote(t, dir, "wiki/a/facts/real.md", "body")
-	store := &stubStore{known: map[string]struct{}{}}
+	store := &stubStore{known: map[string]time.Time{}}
 	res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
 	require.NoError(t, err)
 	assert.Equal(t, 1, res.Added)
-	assert.NotContains(t, store.upserts, "wiki/a/_index.md")
+	assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001")
 }
 func TestSync_ScansKnowledgeDir(t *testing.T) {
@@ -122,13 +135,123 @@ func TestSync_ScansKnowledgeDir(t *testing.T) {
 	writeNote(t, dir, "wiki/a/facts/x.md", "x")
 	writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body")
-	store := &stubStore{known: map[string]struct{}{}}
+	store := &stubStore{known: map[string]time.Time{}}
 	emb := stubEmbedder{vec: make([]float32, 768)}
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
 	assert.Equal(t, 2, res.Added)
-	assert.Contains(t, store.upserts, "wiki/a/facts/x.md")
+	assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001")
-	assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md")
+	assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001")
 }
 // A file with many sections must be stored as several chunk-suffixed
 // rows, and never under its bare parent path.
 func TestSync_ChunksLongFiles(t *testing.T) {
 	root := t.TempDir()
 	// Assemble a multi-section document well beyond the chunk byte
 	// budget, giving the chunker heading boundaries to cut on.
 	var doc strings.Builder
 	doc.WriteString("# Doc\n\nintro line.\n\n")
 	for i := 0; i < 10; i++ {
 		doc.WriteString("## Section " + string(rune('A'+i)) + "\n\n")
 		doc.WriteString(strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n")
 	}
 	writeNote(t, root, "knowledge/long.md", doc.String())
 	st := &stubStore{known: map[string]time.Time{}}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Greater(t, got.Added, 1, "long file should produce multiple chunk rows")
 	// Count the upserted rows that are chunk paths of this file.
 	suffixed := 0
 	for key := range st.upserts {
 		if strings.HasPrefix(key, "knowledge/long.md#") {
 			suffixed++
 		}
 	}
 	assert.Equal(t, got.Added, suffixed, "all rows for long file should be chunk-suffixed")
 	// The parent path itself must never be written.
 	assert.NotContains(t, st.upserts, "knowledge/long.md")
 }
 // A file that fits in one chunk is still stored under a "#0001" path.
 func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) {
 	root := t.TempDir()
 	writeNote(t, root, "wiki/short.md", "tiny body\n")
 	st := &stubStore{known: map[string]time.Time{}}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Equal(t, 1, got.Added)
 	assert.Contains(t, st.upserts, "wiki/short.md#0001")
 }
 // A file that already has a chunk row in the store (with the stub's
 // zero-value timestamp) must not be embedded again.
 func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) {
 	root := t.TempDir()
 	writeNote(t, root, "wiki/foo.md", "body\n")
 	known := map[string]time.Time{"wiki/foo.md#0001": {}}
 	st := &stubStore{known: known}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Equal(t, 0, got.Added)
 	assert.Empty(t, st.upserts)
 }
 // When a file is gone from disk, every one of its chunk rows must be
 // counted as deleted.
 func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) {
 	root := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(root, "wiki"), 0o755))
 	known := make(map[string]time.Time, 3)
 	for _, chunkPath := range []string{"wiki/ghost.md#0001", "wiki/ghost.md#0002", "wiki/ghost.md#0003"} {
 		known[chunkPath] = time.Time{}
 	}
 	st := &stubStore{known: known}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Equal(t, 3, got.Deleted)
 }
 // A file whose mtime is later than its stored chunk's updated_at must be
 // re-embedded, with the old chunk row deleted as part of the rewrite.
 func TestSync_ReembedsFileWhenMtimeNewer(t *testing.T) {
 	root := t.TempDir()
 	writeNote(t, root, "wiki/edited.md", "original body\n")
 	// Push the file's mtime one hour ahead, past any plausible updated_at.
 	mtime := time.Now().Add(time.Hour)
 	target := filepath.Join(root, "wiki/edited.md")
 	require.NoError(t, os.Chtimes(target, mtime, mtime))
 	// The stored chunk row is an hour old, so it pre-dates the mtime.
 	staleAt := time.Now().Add(-time.Hour)
 	st := &stubStore{known: map[string]time.Time{"wiki/edited.md#0001": staleAt}}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Equal(t, 1, got.Added, "file with newer mtime should be re-embedded")
 	assert.Contains(t, st.upserts, "wiki/edited.md#0001")
 	// The parent's previous chunks have to be removed before re-embedding,
 	// otherwise a shrunk file would keep orphan rows at higher #NNNN.
 	assert.Contains(t, st.deletes, "wiki/edited.md#0001")
 }
 // A file whose mtime lies before its stored chunk's updated_at must be
 // left alone: no upserts and no deletes.
 func TestSync_SkipsFileWhenMtimeOlder(t *testing.T) {
 	root := t.TempDir()
 	writeNote(t, root, "wiki/stable.md", "body\n")
 	// Move the mtime two hours back, before the recorded updated_at.
 	mtime := time.Now().Add(-2 * time.Hour)
 	target := filepath.Join(root, "wiki/stable.md")
 	require.NoError(t, os.Chtimes(target, mtime, mtime))
 	st := &stubStore{known: map[string]time.Time{"wiki/stable.md#0001": time.Now()}}
 	embedder := stubEmbedder{vec: make([]float32, 768)}
 	got, err := vectorstore.Sync(context.Background(), root, st, embedder)
 	require.NoError(t, err)
 	assert.Equal(t, 0, got.Added)
 	assert.Empty(t, st.upserts)
 	assert.Empty(t, st.deletes)
 }
 func TestSync_NoOpWhenComponentsNil(t *testing.T) {
@@ -142,7 +265,7 @@ func TestSync_NoOpWhenComponentsNil(t *testing.T) {
 func TestSync_CollectsEmbedderErrors(t *testing.T) {
 	dir := t.TempDir()
 	writeNote(t, dir, "wiki/a/facts/x.md", "x")
-	store := &stubStore{known: map[string]struct{}{}}
+	store := &stubStore{known: map[string]time.Time{}}
 	emb := stubEmbedder{err: errors.New("upstream down")}
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
--- a/internal/skills/project/handlers.go
+++ b/internal/skills/project/handlers.go
@@ -13,12 +13,13 @@ import (
 )
 type createArgs struct {
-	Name        string `json:"name"`
+	Name           string `json:"name"`
-	Description string `json:"description"`
+	Description    string `json:"description"`
-	Hypothesis  string `json:"hypothesis"`
+	Hypothesis     string `json:"hypothesis"`
-	Folder      string `json:"folder"`
+	Folder         string `json:"folder"`
-	Stack       string `json:"stack"`
+	Stack          string `json:"stack"`
-	Private     bool   `json:"private"`
+	Private        bool   `json:"private"`
 	MirrorToGitHub bool   `json:"mirror_to_github,omitempty"`
 }
 type createResult struct {
@@ -59,11 +60,12 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
 	tmpl := templateFor(args.Stack)
 	giteaURL := fmt.Sprintf("http://gitea.d-ma.be/%s/%s", s.cfg.GiteaOwner, args.Name)
 	githubURL := fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)
 	res := createResult{
-		GiteaURL:  giteaURL,
+		GiteaURL: giteaURL,
-		GitHubURL: githubURL,
+	}
 	if args.MirrorToGitHub {
 		res.GitHubURL = fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)
 	}
 	// Step 1: create_project_from_template. If the repo already exists,
@@ -75,25 +77,32 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
 	}
 	res.Reached = append(res.Reached, stepCreateRepo)
-	// Step 2: create empty GitHub repo. Gitea's push-mirror cannot push
+	// Steps 2+3 are skipped when MirrorToGitHub is false. Default per
-	// to a non-existent remote, so the destination must exist before
+	// infra ADR (Gitea as true master, GitHub as optional opt-in): keep
-	// step 3 configures the mirror. Skipped when GitHub client is unset
+	// client / business-logic / personal repos Gitea-only. Set
-	// (degraded mode — see Config.GitHub doc).
+	// `mirror_to_github: true` for open-source projects that want a
-	if s.cfg.GitHub != nil {
+	// public GitHub mirror (hyperguild, gitea-mcp, template-*).
-		if err := s.callCreateGitHubRepo(ctx, args); err != nil && !errors.Is(err, githubclient.ErrAlreadyExists) {
+	if args.MirrorToGitHub {
-			return marshalPartial(res, stepCreateGitHub, err)
+		// Step 2: create empty GitHub repo. Gitea's push-mirror cannot push
 		// to a non-existent remote, so the destination must exist before
 		// step 3 configures the mirror. Skipped when GitHub client is unset
 		// (degraded mode — see Config.GitHub doc).
 		if s.cfg.GitHub != nil {
 			if err := s.callCreateGitHubRepo(ctx, args); err != nil && !errors.Is(err, githubclient.ErrAlreadyExists) {
 				return marshalPartial(res, stepCreateGitHub, err)
 			}
 			res.Reached = append(res.Reached, stepCreateGitHub)
 		}
 		res.Reached = append(res.Reached, stepCreateGitHub)
 	}
-	// Step 3: configure push mirror to GitHub. Idempotent: if a mirror with
+		// Step 3: configure push mirror to GitHub. Idempotent: if a mirror with
-	// the same remote already exists, gitea-mcp returns Conflict; we swallow it.
+		// the same remote already exists, gitea-mcp returns Conflict; we swallow it.
-	if err := s.callMirror(ctx, args.Name); err != nil {
+		if err := s.callMirror(ctx, args.Name); err != nil {
-		if !isConflict(err) {
+			if !isConflict(err) {
-			return marshalPartial(res, stepMirror, err)
+				return marshalPartial(res, stepMirror, err)
 			}
 		}
 		res.Reached = append(res.Reached, stepMirror)
 	}
 	res.Reached = append(res.Reached, stepMirror)
 	// Step 3: commit staging namespace manifest to infra repo. Done before
 	// the issue so the staging env is reconciling by the time the issue lands.
@@ -228,7 +237,11 @@ func experimentBrief(args createArgs, existed bool) string {
 	b.WriteString("- Repo created from `template-")
 	b.WriteString(args.Stack)
 	b.WriteString("` on Gitea.\n")
-	b.WriteString("- Push-mirror configured to GitHub.\n")
+	if args.MirrorToGitHub {
 		b.WriteString("- Push-mirror configured to GitHub.\n")
 	} else {
 		b.WriteString("- Gitea-only (no GitHub mirror — set `mirror_to_github: true` to opt in).\n")
 	}
 	b.WriteString("- Staging namespace manifest committed to infra repo.\n\n")
 	if existed {
 		b.WriteString("> Note: this repo already existed when `project_create` ran — provisioning steps were re-applied idempotently.\n")
--- a/internal/skills/project/handlers_test.go
+++ b/internal/skills/project/handlers_test.go
@@ -158,6 +158,9 @@ func mustClient(t *testing.T, url string) *mcpclient.Client {
 	return c
 }
 // happyArgs returns the minimal valid request. With the Gitea-as-true-master
 // ADR shipped, this defaults to Gitea-only (mirror_to_github omitted = false).
 // Tests that need the full Gitea + GitHub mirror flow use mirroredArgs().
 func happyArgs() json.RawMessage {
 	return json.RawMessage(`{
 		"name":"my-experiment",
@@ -169,6 +172,20 @@ func happyArgs() json.RawMessage {
 	}`)
 }
 // mirroredArgs builds the opt-in request: the happyArgs payload with
 // "mirror_to_github" explicitly set to true, i.e. the flow that was the
 // default before the ADR. The payload is returned as raw JSON so tests can
 // hand it straight to skill.Handle.
 func mirroredArgs() json.RawMessage {
 	const payload = `{
 		"name":"my-experiment",
 		"description":"One-line desc",
 		"hypothesis":"We believe X produces Y",
 		"folder":"AGENTS",
 		"stack":"go-agent",
 		"private":true,
 		"mirror_to_github":true
 	}`
 	return json.RawMessage(payload)
 }
 func TestProjectCreate_HappyPath(t *testing.T) {
 	f := &fakeGiteaMCP{
 		Responses: map[string]any{
@@ -177,7 +194,7 @@ func TestProjectCreate_HappyPath(t *testing.T) {
 	}
 	skill, gh := newSkill(t, f)
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)
 	var res map[string]any
@@ -228,7 +245,7 @@ func TestProjectCreate_GitHubExists_Idempotent(t *testing.T) {
 	skill, gh := newSkill(t, f)
 	gh.ReturnError = 422 // already exists
-	_, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	_, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err, "422 already-exists should be idempotent")
 	require.Len(t, f.Calls, 4, "all gitea steps still run despite github 422")
 }
@@ -238,7 +255,7 @@ func TestProjectCreate_GitHubFails(t *testing.T) {
 	skill, gh := newSkill(t, f)
 	gh.ReturnError = 401 // bad PAT
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)
 	var res map[string]any
 	require.NoError(t, json.Unmarshal(out, &res))
@@ -255,7 +272,11 @@ func TestProjectCreate_NoGitHubClient_DegradedMode(t *testing.T) {
 	}
 	skill := newSkillNoGitHub(t, f)
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	// Use mirroredArgs so we exercise the GitHub-mirror path. With the
 	// GitHub client nil, the create_github_repo step is skipped but the
 	// mirror step still attempts to configure the push-mirror remote
 	// (degraded mode preserves the prior contract for opted-in projects).
 	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)
 	var res map[string]any
 	require.NoError(t, json.Unmarshal(out, &res))
@@ -275,7 +296,7 @@ func TestProjectCreate_Idempotent_RepoExists(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)
 	var res map[string]any
@@ -295,7 +316,7 @@ func TestProjectCreate_MirrorFails(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), `"mirror" failed`)
@@ -317,7 +338,7 @@ func TestProjectCreate_InfraCommitFails(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)
-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)
 	var res map[string]any
@@ -351,6 +372,45 @@ func TestProjectCreate_ValidationErrors(t *testing.T) {
 	assert.Empty(t, f.Calls, "no upstream calls should occur on validation failure")
 }
 // TestProjectCreate_DefaultSkipsGitHubMirror pins the default request path
 // (happyArgs, which leaves mirror_to_github unset): no GitHub repo is created,
 // no push-mirror is configured, and the response carries an empty github_url.
 func TestProjectCreate_DefaultSkipsGitHubMirror(t *testing.T) {
 	// Default (mirror_to_github omitted) skips create_github_repo + mirror
 	// per the Gitea-as-true-master ADR. Gitea repo + staging namespace
 	// + issue still run; github_url is empty in the response.
 	f := &fakeGiteaMCP{
 		Responses: map[string]any{
 			"issue_create": map[string]any{"html_url": "http://gitea.d-ma.be/mathias/my-experiment/issues/1"},
 		},
 	}
 	skill, gh := newSkill(t, f)
 	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
 	require.NoError(t, err)
 	var res map[string]any
 	require.NoError(t, json.Unmarshal(out, &res))
 	assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment", res["gitea_url"])
 	assert.Equal(t, "", res["github_url"], "github_url must be empty when mirror not opted in")
 	assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment/issues/1", res["issue_url"])
 	// 3 gitea-mcp calls: template create, staging file write, issue. NO mirror call.
 	// require (not assert) so the indexed accesses below cannot go out of range.
 	require.Len(t, f.Calls, 3)
 	assert.Equal(t, "create_project_from_template", f.Calls[0].Tool)
 	assert.Equal(t, "file_write_branch", f.Calls[1].Tool)
 	assert.Equal(t, "issue_create", f.Calls[2].Tool)
 	// Zero GitHub API calls.
 	assert.Empty(t, gh.Calls, "no GitHub repo created when mirror_to_github is false")
 	// reached lists the Gitea-only path. Compare the decoded value directly
 	// instead of type-asserting it first: a missing or non-array "reached"
 	// then shows up as a normal assertion failure rather than a panic.
 	assert.Equal(t, []any{"create_repo", "infra_commit", "issue"}, res["reached"])
 	// experiment-brief body reflects Gitea-only provisioning.
 	require.Contains(t, f.Calls[2].Args["body"], "Gitea-only")
 	require.NotContains(t, f.Calls[2].Args["body"], "Push-mirror configured")
 }
 func TestProjectCreate_UnknownTool(t *testing.T) {
 	f := &fakeGiteaMCP{}
 	skill, _ := newSkill(t, f)
--- a/internal/skills/project/skill.go
+++ b/internal/skills/project/skill.go
@@ -79,13 +79,22 @@ func (s *Skill) Tools() []registry.ToolDef {
 				"description": "Selects template-go-agent or template-go-web.",
 			},
 			"private": map[string]any{"type": "boolean"},
 			"mirror_to_github": map[string]any{
 				"type": "boolean",
 				"description": "Default false. When true, also create an empty GitHub repo " +
 					"and configure a push-mirror from Gitea. Opt-in per the Gitea-as-true-master " +
 					"ADR — only set true for open-source projects (hyperguild, gitea-mcp, template-*). " +
 					"Never set true for client projects, business logic, or personal experiments.",
 			},
 		},
 		"required": []string{"name", "description", "hypothesis", "stack"},
 	})
 	return []registry.ToolDef{
 		{
-			Name:        "project_create",
+			Name: "project_create",
-			Description: "Bootstrap a new project: Gitea repo from template, GitHub push-mirror, staging namespace manifest, experiment-brief issue. Idempotent — re-running with an existing repo returns the existing URLs.",
+			Description: "Bootstrap a new project: Gitea repo from template, staging namespace manifest, " +
 				"experiment-brief issue. Optionally mirrors to GitHub when `mirror_to_github: true` " +
 				"(default false). Idempotent — re-running with an existing repo returns the existing URLs.",
 			InputSchema: schema,
 		},
 	}