feat(extract): add Text() dispatcher with md/txt passthrough
This commit is contained in:
39
ingestion/internal/extract/extract.go
Normal file
39
ingestion/internal/extract/extract.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
// ingestion/internal/extract/extract.go
|
||||||
|
package extract
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Text reads the file at path and returns its plain-text content.
|
||||||
|
// Supported extensions: .md, .txt (passthrough), .pdf (via pdftotext).
|
||||||
|
func Text(path string) (string, error) {
|
||||||
|
ext := strings.ToLower(fileExt(path))
|
||||||
|
switch ext {
|
||||||
|
case ".md", ".txt":
|
||||||
|
b, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("read %s: %w", path, err)
|
||||||
|
}
|
||||||
|
return string(b), nil
|
||||||
|
case ".pdf":
|
||||||
|
return extractPDF(path)
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("unsupported file extension: %s", ext)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fileExt returns the extension of the final element of path, including the
// leading dot and converted to lower case (for example "dir/Note.MD" yields
// ".md").
//
// The extension is the suffix that starts at the last '.' found after the
// last path separator; both '/' and '\' are treated as separators. It
// returns "" when the final element contains no dot.
func fileExt(path string) string {
	for i := len(path) - 1; i >= 0; i-- {
		switch path[i] {
		case '.':
			// Lowercase here so the result honours the documented contract
			// no matter what the caller does with it afterwards.
			return strings.ToLower(path[i:])
		case '/', '\\':
			// Hit a separator before any dot: the final element has no
			// extension, and dots in parent directories must not count.
			return ""
		}
	}
	return ""
}
|
||||||
40
ingestion/internal/extract/extract_test.go
Normal file
40
ingestion/internal/extract/extract_test.go
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
// ingestion/internal/extract/extract_test.go
|
||||||
|
package extract
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestText_Markdown(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "note.md")
|
||||||
|
require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644))
|
||||||
|
|
||||||
|
got, err := Text(path)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, "# Hello\n\nWorld.", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestText_Txt(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "note.txt")
|
||||||
|
require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644))
|
||||||
|
|
||||||
|
got, err := Text(path)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, "plain text", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestText_UnsupportedExtension(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "data.csv")
|
||||||
|
require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644))
|
||||||
|
|
||||||
|
_, err := Text(path)
|
||||||
|
assert.ErrorContains(t, err, "unsupported")
|
||||||
|
}
|
||||||
8
ingestion/internal/extract/pdf.go
Normal file
8
ingestion/internal/extract/pdf.go
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
// ingestion/internal/extract/pdf.go
|
||||||
|
package extract
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
|
||||||
|
// extractPDF is a placeholder: it ignores its argument and always returns an
// empty string together with a "not implemented" error.
func extractPDF(_ string) (string, error) {
	notImplemented := fmt.Errorf("PDF extraction not implemented")
	return "", notImplemented
}
|
||||||
Reference in New Issue
Block a user