feat(extract): add Text() dispatcher with md/txt passthrough
This commit is contained in:
39
ingestion/internal/extract/extract.go
Normal file
39
ingestion/internal/extract/extract.go
Normal file
@@ -0,0 +1,39 @@
|
||||
// ingestion/internal/extract/extract.go
|
||||
package extract
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Text reads the file at path and returns its plain-text content.
|
||||
// Supported extensions: .md, .txt (passthrough), .pdf (via pdftotext).
|
||||
func Text(path string) (string, error) {
|
||||
ext := strings.ToLower(fileExt(path))
|
||||
switch ext {
|
||||
case ".md", ".txt":
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read %s: %w", path, err)
|
||||
}
|
||||
return string(b), nil
|
||||
case ".pdf":
|
||||
return extractPDF(path)
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported file extension: %s", ext)
|
||||
}
|
||||
}
|
||||
|
||||
// fileExt returns the extension of path, lowercased and including the
// leading dot. The extension is the suffix that starts at the last dot of
// the final path element; "" is returned when that element has no dot.
// Both '/' and '\\' are treated as path separators, so a dot inside a
// parent directory name is never mistaken for an extension.
func fileExt(path string) string {
	for i := len(path) - 1; i >= 0; i-- {
		switch path[i] {
		case '.':
			// Lowercase here so the result honours the documented contract
			// no matter how the file name was spelled.
			return strings.ToLower(path[i:])
		case '/', '\\':
			// A separator was reached before any dot: no extension.
			return ""
		}
	}
	return ""
}
|
||||
40
ingestion/internal/extract/extract_test.go
Normal file
40
ingestion/internal/extract/extract_test.go
Normal file
@@ -0,0 +1,40 @@
|
||||
// ingestion/internal/extract/extract_test.go
|
||||
package extract
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestText_Markdown(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "note.md")
|
||||
require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644))
|
||||
|
||||
got, err := Text(path)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "# Hello\n\nWorld.", got)
|
||||
}
|
||||
|
||||
func TestText_Txt(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "note.txt")
|
||||
require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644))
|
||||
|
||||
got, err := Text(path)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "plain text", got)
|
||||
}
|
||||
|
||||
func TestText_UnsupportedExtension(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "data.csv")
|
||||
require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644))
|
||||
|
||||
_, err := Text(path)
|
||||
assert.ErrorContains(t, err, "unsupported")
|
||||
}
|
||||
8
ingestion/internal/extract/pdf.go
Normal file
8
ingestion/internal/extract/pdf.go
Normal file
@@ -0,0 +1,8 @@
|
||||
// ingestion/internal/extract/pdf.go
|
||||
package extract
|
||||
|
||||
import "fmt"
|
||||
|
||||
// extractPDF is a placeholder for PDF text extraction. It ignores its
// argument and always returns an empty string together with a
// "not implemented" error.
func extractPDF(_ string) (string, error) {
	notImplemented := fmt.Errorf("PDF extraction not implemented")
	return "", notImplemented
}
|
||||
Reference in New Issue
Block a user