feat(extract): implement PDF extraction via pdftotext
This commit is contained in:
@@ -1,8 +1,28 @@
|
||||
// ingestion/internal/extract/pdf.go
|
||||
package extract
|
||||
|
||||
import "fmt"
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func extractPDF(_ string) (string, error) {
|
||||
return "", fmt.Errorf("PDF extraction not implemented")
|
||||
// extractPDF runs pdftotext on path and returns the extracted text with
// leading and trailing whitespace removed.
//
// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew).
//
// On failure the returned error is prefixed with "pdftotext: ", includes any
// text the tool wrote to stderr, and wraps the underlying os/exec error so
// callers can inspect it with errors.Is / errors.As (for example against
// exec.ErrNotFound when the binary is not installed).
func extractPDF(path string) (string, error) {
	// The trailing "-" is the output-file argument; per the pdftotext manual
	// it sends the text to stdout. "-q" asks the tool to stay quiet, so stderr
	// is frequently empty even when the command fails.
	cmd := exec.Command("pdftotext", "-q", path, "-")

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Prefer the tool's own diagnostics when it produced any, but always
		// keep the original error in the chain via %w.
		if errMsg := strings.TrimSpace(stderr.String()); errMsg != "" {
			return "", fmt.Errorf("pdftotext: %s: %w", errMsg, err)
		}
		return "", fmt.Errorf("pdftotext: %w", err)
	}

	return strings.TrimSpace(stdout.String()), nil
}
|
||||
|
||||
Reference in New Issue
Block a user