feat(pipeline): add POST /ingest-raw for direct batch ingestion without LLM

Allows callers to provide pre-structured RawPage data directly, bypassing the LLM extraction step. The pipeline still handles slug computation, frontmatter, link canonicalization, source back-references, and dedup — only the extraction is skipped. Useful when a more capable model or manual curation produces the structured data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 11:15:59 +02:00
parent 3e9a648115
commit 0a70d9e972
6 changed files with 204 additions and 7 deletions
--- a/internal/skills/brain/handlers.go
+++ b/internal/skills/brain/handlers.go
@@ -17,6 +17,8 @@ func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (
 		return s.query(ctx, args)
 	case "brain_write":
 		return s.write(ctx, args)
+	case "brain_ingest_raw":
+		return s.ingestRaw(ctx, args)
 	case "brain_ingest":
 		return s.ingest(ctx, args)
 	case "brain_search":
@@ -98,6 +100,33 @@ func (s *Skill) ingest(ctx context.Context, args json.RawMessage) (json.RawMessa
 	return nil, fmt.Errorf("either content+source or path is required")
 }

+// ingestRawArgs is the decoded argument payload of the brain_ingest_raw tool.
+//
+// Source is the caller-supplied name of the source, Pages holds the
+// pre-structured page objects, and DryRun is the optional dry_run flag.
+//
+// Pages is kept as raw JSON rather than []any: the pages are only counted
+// and forwarded, so decoding them into generic maps would add a pointless
+// decode/re-encode round trip and would coerce every JSON number to float64
+// (losing precision on large integers) before it is sent on.
+type ingestRawArgs struct {
+	Source string            `json:"source"`
+	Pages  []json.RawMessage `json:"pages"`
+	DryRun bool              `json:"dry_run,omitempty"`
+}
+
+// ingestRaw handles the brain_ingest_raw tool. It decodes args, rejects the
+// call when the ingest service URL is not configured or when source or pages
+// is missing, and otherwise posts source, pages and dry_run to the ingest
+// service's /ingest-raw endpoint, returning whatever postTo returns.
+func (s *Skill) ingestRaw(ctx context.Context, args json.RawMessage) (json.RawMessage, error) {
+	var req ingestRawArgs
+	if err := json.Unmarshal(args, &req); err != nil {
+		return nil, fmt.Errorf("parse args: %w", err)
+	}
+
+	// Cases are evaluated top to bottom, so a missing service URL is
+	// reported before any complaint about the request fields.
+	switch {
+	case s.cfg.IngestSvcURL == "":
+		return nil, fmt.Errorf("brain_ingest_raw: INGEST_SVC_URL not configured")
+	case req.Source == "":
+		return nil, fmt.Errorf("source is required")
+	case len(req.Pages) == 0:
+		return nil, fmt.Errorf("pages is required and must be non-empty")
+	}
+
+	endpoint := s.cfg.IngestSvcURL + "/ingest-raw"
+	payload := map[string]any{
+		"source":  req.Source,
+		"pages":   req.Pages,
+		"dry_run": req.DryRun,
+	}
+	return s.postTo(ctx, endpoint, payload)
+}
+
 type searchArgs struct {
 	Query      string `json:"query"`
 	Collection string `json:"collection,omitempty"`
--- a/internal/skills/brain/skill.go
+++ b/internal/skills/brain/skill.go
@@ -55,6 +55,32 @@ func (s *Skill) Tools() []registry.ToolDef {
 		},
 	}
 	if s.cfg.IngestSvcURL != "" {
+		tools = append(tools, registry.ToolDef{
+			Name: "brain_ingest_raw",
+			Description: "Ingest pre-structured pages into the brain wiki, bypassing the LLM extraction step. " +
+				"Use when you (the calling agent) have already extracted entities, concepts, and content from a source. " +
+				"Provide source (human-readable name) and pages (array of {title, type, subtype, domain, content} objects). " +
+				"The pipeline computes slugs, paths, frontmatter, wikilink canonicalization, and source back-references. " +
+				"Returns the list of wiki pages written.",
+			InputSchema: schema([]string{"source", "pages"}, map[string]any{
+				"source": map[string]any{"type": "string", "description": "human-readable name for the source, e.g. 'shape-up-book'"},
+				"pages": map[string]any{
+					"type": "array",
+					"items": map[string]any{
+						"type":     "object",
+						"required": []string{"title", "type", "content"},
+						"properties": map[string]any{
+							"title":   map[string]any{"type": "string", "description": "page title, e.g. 'Hash Encoding'"},
+							"type":    map[string]any{"type": "string", "enum": []string{"source", "concept", "entity"}, "description": "page type"},
+							"subtype": map[string]any{"type": "string", "description": "entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project"},
+							"domain":  map[string]any{"type": "string", "description": "knowledge domain, e.g. 'Machine Learning'"},
+							"content": map[string]any{"type": "string", "description": "markdown body — no frontmatter, use [[Display Name]] for wikilinks"},
+						},
+					},
+				},
+				"dry_run": map[string]any{"type": "boolean"},
+			}),
+		})
 		tools = append(tools, registry.ToolDef{
 			Name: "brain_ingest",
 			Description: "Ingest content into the brain wiki (brain/wiki/). Calls an LLM to produce structured wiki pages. " +