package vectorstore

import (
	"fmt"
	"strings"
)

// NumberedChunk pairs a chunk's body with the storage path it will use
// in brain_embeddings. Path format: "<parent>#NNNN" where NNNN is the
// 1-based chunk index zero-padded to 4 digits.
type NumberedChunk struct {
	Path    string
	Content string
}

// ParentPath returns the file path with a trailing "#NNNN" chunk suffix
// removed. Inputs that do not end in "#" followed by at least four
// digits are returned unchanged, so a "#" that is part of the file path
// itself (e.g. "lang/c#/intro.md") is preserved. Used by search to
// dedupe chunk-level hits back to a single document per result.
func ParentPath(p string) string {
	// Only the last "#" can start the chunk suffix; cutting at the first
	// one would truncate parents that contain "#" themselves.
	i := strings.LastIndexByte(p, '#')
	if i < 0 || !isChunkIndex(p[i+1:]) {
		return p
	}
	return p[:i]
}

// isChunkIndex reports whether s looks like the numeric part of a chunk
// suffix as produced by NumberChunks: ASCII digits only, at least four
// of them (the "%04d" format never yields fewer).
func isChunkIndex(s string) bool {
	if len(s) < 4 {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}

// NumberChunks assigns "<parent>#NNNN" storage paths to a slice of chunk
// bodies, indexed from 0001. Empty (whitespace-only) chunks are dropped
// and do not consume an index; kept chunks retain their content verbatim.
func NumberChunks(parent string, chunks []string) []NumberedChunk {
	out := make([]NumberedChunk, 0, len(chunks))
	idx := 1
	for _, c := range chunks {
		if strings.TrimSpace(c) == "" {
			continue
		}
		out = append(out, NumberedChunk{
			Path:    fmt.Sprintf("%s#%04d", parent, idx),
			Content: c,
		})
		idx++
	}
	return out
}

// ChunkMarkdown splits a markdown document into embedding-sized pieces.
// Strategy:
//  1. Split at H1/H2 headings (top-of-line "#" or "##"). The intro before
//     the first heading is its own chunk. Heading-like lines inside
//     fenced or indented code are not treated as headings.
//  2. Any section larger than maxBytes is further split at paragraph
//     boundaries (blank lines), packing paragraphs greedily under the
//     byte budget.
//
// The function aims for "fits comfortably under nomic-embed-text's 2048-
// token context" — at ~4 chars/token for English markdown, maxBytes ≈ 4000
// is a safe call-site default. A non-positive maxBytes selects 4000.
//
// CRLF line endings are normalized to LF before chunking; whitespace-only
// sections are never returned.
func ChunkMarkdown(content string, maxBytes int) []string {
	if maxBytes <= 0 {
		maxBytes = 4000
	}
	// Paragraph splitting looks for "\n\n"; without normalization a CRLF
	// document ("\r\n\r\n") would never be split and could blow the budget.
	content = strings.ReplaceAll(content, "\r\n", "\n")

	sections := splitAtHeadings(content)
	out := make([]string, 0, len(sections))
	for _, s := range sections {
		if len(s) <= maxBytes {
			out = append(out, s)
			continue
		}
		out = append(out, splitAtParagraphs(s, maxBytes)...)
	}
	return out
}

// splitAtHeadings cuts content into sections that each start with an
// "# " or "## " line (intro before any heading is the leading section).
//
// A line counts as a heading only when it is indented by at most three
// spaces and is not inside a fenced code block ("```" or "~~~"), so
// shell-style "# comment" lines in code samples do not split a section.
// An unclosed fence runs to the end of the document. Each returned
// section ends in exactly one "\n"; whitespace-only sections are dropped.
func splitAtHeadings(content string) []string {
	var (
		sections []string
		cur      strings.Builder
		fenceCh  byte // fence character of the open code block; 0 when outside one
		fenceLen int  // length of the fence run that opened the block
	)

	flush := func() {
		// Trim trailing newlines then re-add a single one so a
		// single-paragraph file round-trips to its original content rather
		// than accumulating extra newlines from the empty-line split.
		s := strings.TrimRight(cur.String(), "\n")
		cur.Reset()
		if strings.TrimSpace(s) == "" {
			return
		}
		sections = append(sections, s+"\n")
	}

	for _, ln := range strings.Split(content, "\n") {
		text := strings.TrimLeft(ln, " ")
		indent := len(ln) - len(text)
		ch, n, info := fenceRun(text)

		switch {
		case indent > 3:
			// Four or more spaces: code or continuation text, never a
			// heading or a fence delimiter.
		case fenceCh != 0:
			// Inside a fence: only a matching, info-less fence of at
			// least the opening length closes it.
			if ch == fenceCh && n >= fenceLen && info == "" {
				fenceCh, fenceLen = 0, 0
			}
		case ch != 0 && !(ch == '`' && strings.Contains(info, "`")):
			// Opening fence. A backtick run followed by more backticks
			// on the same line is inline code, not a fence.
			fenceCh, fenceLen = ch, n
		case strings.HasPrefix(text, "# ") || strings.HasPrefix(text, "## "):
			flush()
		}

		cur.WriteString(ln)
		cur.WriteByte('\n')
	}
	flush()
	return sections
}

// fenceRun inspects a line whose leading spaces have been removed. If it
// starts with a run of three or more backticks or tildes, fenceRun
// returns that character, the run length, and the trimmed remainder of
// the line (the info string). Otherwise it returns a zero ch.
func fenceRun(text string) (ch byte, n int, info string) {
	if text == "" || (text[0] != '`' && text[0] != '~') {
		return 0, 0, ""
	}
	ch = text[0]
	for n < len(text) && text[n] == ch {
		n++
	}
	if n < 3 {
		return 0, 0, ""
	}
	return ch, n, strings.TrimSpace(text[n:])
}

// splitAtParagraphs packs paragraphs (blank-line separated blocks) into
// sub-chunks of at most maxBytes. A single paragraph that itself exceeds
// maxBytes is emitted as one over-budget chunk rather than being split
// mid-sentence — better to over-spend a little than truncate prose.
func splitAtParagraphs(section string, maxBytes int) []string {
	var (
		out []string
		cur strings.Builder
	)
	for _, p := range strings.Split(section, "\n\n") {
		if p == "" {
			continue
		}
		// +2 for the "\n\n" rejoin if cur isn't empty.
		need := len(p)
		if cur.Len() > 0 {
			need += 2
		}
		// Close the current chunk when adding p would exceed the budget.
		if cur.Len() > 0 && cur.Len()+need > maxBytes {
			out = append(out, cur.String())
			cur.Reset()
		}
		if cur.Len() > 0 {
			cur.WriteString("\n\n")
		}
		cur.WriteString(p)
	}
	if cur.Len() > 0 {
		out = append(out, cur.String())
	}
	return out
}