feat(extract): implement PDF extraction via pdftotext

2026-04-23 15:53:46 +02:00
parent 43a46d07e5
commit 9cc6c2d053
2 changed files with 45 additions and 3 deletions
--- a/ingestion/internal/extract/extract_test.go
+++ b/ingestion/internal/extract/extract_test.go
@@ -3,6 +3,7 @@ package extract

 import (
 	"os"
+	"os/exec"
 	"path/filepath"
 	"testing"

@@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) {
 	_, err := Text(path)
 	assert.ErrorContains(t, err, "unsupported")
 }
+
+// TestText_PDF checks that Text extracts the text of a PDF file; it is skipped when pdftotext is not on PATH.
+func TestText_PDF(t *testing.T) {
+	if _, err := exec.LookPath("pdftotext"); err != nil {
+		t.Skip("pdftotext not available")
+	}
+	pdfPath := filepath.Join(t.TempDir(), "test.pdf")
+
+	// Minimal PDF with the text "Hello PDF"; xref offsets, /Length and startxref are exact byte positions.
+	minimalPDF := "%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" +
+		"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" +
+		"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" +
+		"4 0 obj<</Length 41>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
+		"xref\n0 5\n0000000000 65535 f \n0000000009 00000 n \n0000000052 00000 n \n0000000101 00000 n \n0000000252 00000 n \n" +
+		"trailer<</Size 5/Root 1 0 R>>\nstartxref\n340\n%%EOF\n"
+	require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644))
+
+	got, err := Text(pdfPath)
+	require.NoError(t, err)
+	assert.Contains(t, got, "Hello PDF")
+}