40 lines
852 B
Go
40 lines
852 B
Go
// ingestion/internal/extract/extract.go
|
|
package extract
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
)
|
|
|
|
// Text reads the file at path and returns its plain-text content.
|
|
// Supported extensions: .md, .txt (passthrough), .pdf (via pdftotext).
|
|
func Text(path string) (string, error) {
|
|
ext := strings.ToLower(fileExt(path))
|
|
switch ext {
|
|
case ".md", ".txt":
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("read %s: %w", path, err)
|
|
}
|
|
return string(b), nil
|
|
case ".pdf":
|
|
return extractPDF(path)
|
|
default:
|
|
return "", fmt.Errorf("unsupported file extension: %s", ext)
|
|
}
|
|
}
|
|
|
|
// fileExt returns the extension of the final element of path, including the
// leading dot, or "" when that element contains no dot.
//
// The result keeps the original letter case; callers that need a
// case-insensitive comparison must fold it themselves. Both '/' and '\\' are
// treated as path separators, so a dot in a parent directory name is never
// mistaken for an extension.
func fileExt(path string) string {
	// Walk backwards from the end; end is one past the byte being inspected.
	for end := len(path); end > 0; end-- {
		switch path[end-1] {
		case '.':
			return path[end-1:]
		case '/', '\\':
			// Reached the start of the final path element without seeing a dot.
			return ""
		}
	}
	return ""
}
|