// ingestion/internal/extract/extract_test.go package extract import ( "os" "os/exec" "path/filepath" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestText_Markdown(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "note.md") require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644)) got, err := Text(path) require.NoError(t, err) assert.Equal(t, "# Hello\n\nWorld.", got) } func TestText_Txt(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "note.txt") require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644)) got, err := Text(path) require.NoError(t, err) assert.Equal(t, "plain text", got) } func TestText_UnsupportedExtension(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "data.csv") require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644)) _, err := Text(path) assert.ErrorContains(t, err, "unsupported") } func TestText_PDF(t *testing.T) { if _, err := exec.LookPath("pdftotext"); err != nil { t.Skip("pdftotext not available") } dir := t.TempDir() pdfPath := filepath.Join(dir, "test.pdf") // Minimal valid PDF containing the text "Hello PDF". minimalPDF := "%PDF-1.4\n1 0 obj<>endobj\n" + "2 0 obj<>endobj\n" + "3 0 obj<>>>>>>>endobj\n" + "4 0 obj<>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" + "xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" + "trailer<>\nstartxref\n406\n%%EOF\n" require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644)) got, err := Text(pdfPath) require.NoError(t, err) assert.Contains(t, got, "Hello PDF") }