feat: PDF extraction and fuzzy entity resolution
Some checks failed
cd / Build and deploy (push) Successful in 11s
CI / Lint / Test / Vet (push) Failing after 5s
CI / Mirror to GitHub (push) Has been skipped

- New extract package: Text() dispatcher for .md/.txt passthrough and
  PDF extraction via pdftotext subprocess
- wiki.Entry gains Aliases []string, loaded from YAML frontmatter
- Fuzzy entity resolution in pipeline: normalizes titles (lowercase,
  strip articles, collapse hyphens) and matches proposed pages against
  existing inventory slugs and aliases to prevent proliferation of
  near-duplicate pages
- Watcher and API handler now use extract.Text() instead of os.ReadFile
- Dockerfile: apk add poppler-utils (provides pdftotext) in Alpine runtime stage
This commit is contained in:
Mathias Bergqvist
2026-04-23 16:03:02 +02:00
14 changed files with 1238 additions and 25 deletions

View File

@@ -12,6 +12,7 @@ import (
"time"
"unicode"
"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
)
@@ -99,12 +100,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error {
filename := filepath.Base(path)
source := deriveSource(filename)
content, err := os.ReadFile(path)
content, err := extract.Text(path)
if err != nil {
return fmt.Errorf("read file: %w", err)
return fmt.Errorf("extract text: %w", err)
}
_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false)
_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false)
if runErr != nil {
// Copy to failed/ and leave a .failed marker so we don't retry.
failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")