29 lines
677 B
Go
29 lines
677 B
Go
// ingestion/internal/extract/pdf.go
|
|
package extract
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"os/exec"
|
|
"strings"
|
|
)
|
|
|
|
// extractPDF runs pdftotext on path and returns the extracted text with
// leading and trailing whitespace trimmed.
//
// pdftotext must be installed (package: poppler-utils on Alpine/Debian,
// poppler on Homebrew).
//
// On failure the returned error is prefixed with "pdftotext: " and wraps the
// error reported by running the command, so callers can inspect it with
// errors.Is / errors.As (for example exec.ErrNotFound when the binary is not
// installed, or *exec.ExitError when the tool exits non-zero). Anything the
// tool wrote to stderr is appended to the message.
func extractPDF(path string) (string, error) {
	// A relative path that starts with '-' would be parsed by pdftotext as a
	// command-line option; anchoring it to the current directory keeps it a
	// file name without changing which file it refers to.
	if strings.HasPrefix(path, "-") {
		path = "./" + path
	}

	// The trailing "-" tells pdftotext to write the text to stdout.
	cmd := exec.Command("pdftotext", "-q", path, "-")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Wrap with %w so the underlying cause stays reachable to callers;
		// include stderr only when the tool actually said something.
		if detail := strings.TrimSpace(stderr.String()); detail != "" {
			return "", fmt.Errorf("pdftotext: %w: %s", err, detail)
		}
		return "", fmt.Errorf("pdftotext: %w", err)
	}

	return strings.TrimSpace(stdout.String()), nil
}
|