feat(extract): implement PDF extraction via pdftotext

This commit is contained in:
Mathias Bergqvist
2026-04-23 15:53:46 +02:00
parent 43a46d07e5
commit 9cc6c2d053
2 changed files with 45 additions and 3 deletions

View File

@@ -3,6 +3,7 @@ package extract
import (
"os"
"os/exec"
"path/filepath"
"testing"
@@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) {
_, err := Text(path)
assert.ErrorContains(t, err, "unsupported")
}
// TestText_PDF checks that Text returns the visible text of a PDF file.
//
// The test is skipped when no pdftotext executable can be found via
// exec.LookPath. The fixture is a hand-written, single-page PDF whose content
// stream draws the string "Hello PDF" in Helvetica.
func TestText_PDF(t *testing.T) {
	if _, err := exec.LookPath("pdftotext"); err != nil {
		t.Skip("pdftotext not available")
	}

	// The cross-reference offsets, the startxref pointer and the stream
	// /Length below are byte-exact for this literal, and every xref entry is
	// the 20 bytes the PDF format requires (hence the space before each
	// "\n"). Keeping them exact means the reader parses the file through its
	// normal path instead of depending on damaged-file recovery. If any
	// object text changes, recompute the offsets noted on the right.
	minimalPDF := "%PDF-1.4\n" + // header: bytes 0-8
		"1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" + // object 1 at byte 9
		"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" + // object 2 at byte 52
		"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" + // object 3 at byte 101
		// Object 4 at byte 252. /Length counts the 41 bytes between
		// "stream\n" and the newline that precedes "endstream".
		"4 0 obj<</Length 41>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
		"xref\n0 5\n" + // cross-reference table at byte 340
		"0000000000 65535 f \n" +
		"0000000009 00000 n \n" +
		"0000000052 00000 n \n" +
		"0000000101 00000 n \n" +
		"0000000252 00000 n \n" +
		"trailer<</Size 5/Root 1 0 R>>\nstartxref\n340\n%%EOF\n"

	// t.TempDir is removed automatically when the test finishes.
	pdfPath := filepath.Join(t.TempDir(), "test.pdf")
	require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644))

	got, err := Text(pdfPath)
	require.NoError(t, err)
	assert.Contains(t, got, "Hello PDF")
}