remove pdf dependency and replace with pdftotext...

This commit is contained in:
2025-04-22 19:56:59 +02:00
parent 121f340730
commit ba8a9023ea
3 changed files with 137 additions and 170 deletions

View File

@@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"image"
"io"
"log"
"net"
"os"
@@ -22,7 +21,6 @@ import (
pb "github.com/JuLi0n21/thumbnail_service/proto"
"github.com/google/uuid"
"github.com/ledongthuc/pdf"
"github.com/nfnt/resize"
"google.golang.org/grpc"
)
@@ -196,21 +194,14 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
end := time.Since(time.Now())
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
}()
if req.FileType != pb.FileType_PDF {
err := errors.New("unsupported Filetype " + req.FileType.String())
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr(err.Error(), err)
}
file, err := os.CreateTemp("ocr", "temp-file-*")
if err != nil {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr("failed to create temp file", err)
}
filePath := file.Name()
defer func(file *os.File, filePath string) {
@@ -224,57 +215,28 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
_, err = file.Write(req.FileContent)
if err != nil {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr("failed to write file to temp file", err)
}
if ok, err := isScannedPDF(filePath); err != nil {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr("failed to check if file is scanned", err)
} else if !ok {
if isEncrypted(filePath) {
err := decryptPDF(filePath)
if err != nil {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr("failed to decrypt pdf", err)
}
}
err = runOCRMyPDF(filePath)
if err != nil {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
return handleErr("failed to ocr pdf", err)
}
}
var text string
var b []byte
text, b, err = extractTextFromPDF(filePath)
text, b, err := extractTextFromPDF(filePath)
if err != nil {
if strings.Contains("malformed pdf", err.Error()) {
repairPDF(filePath)
text, b, err = extractTextFromPDF(filePath)
} else {
return &pb.OCRFileResponse{
Message: "OCR failed, " + err.Error(),
TextContent: "",
OcrContent: []byte{},
}, err
}
return handleErr("failed to extract text", err)
}
return &pb.OCRFileResponse{
@@ -284,125 +246,13 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
}, nil
}
func isScannedPDF(path string) (bool, error) {
f, r, err := pdf.Open(path)
if err != nil {
return false, fmt.Errorf("failed to open PDF: %w", err)
}
defer f.Close()
reader, err := r.GetPlainText()
if err != nil {
return false, fmt.Errorf("failed to get PDF text: %w", err)
}
content, err := io.ReadAll(reader)
if err != nil {
return false, fmt.Errorf("failed to read PDF content: %w", err)
}
return len(strings.TrimSpace(string(content))) != 0, nil
}
func runOCRMyPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
defer os.Remove(tempfile.Name())
if err != nil {
return err
}
cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output)
}
processedData, err := os.ReadFile(tempfile.Name())
if err != nil {
return fmt.Errorf("failed to read processed file: %v", err)
}
err = os.WriteFile(inputPath, processedData, 0644)
if err != nil {
return fmt.Errorf("failed to overwrite input file: %v", err)
}
return nil
}
func isEncrypted(pdfPath string) bool {
cmd := exec.Command("qpdf", "--check", pdfPath)
output, err := cmd.CombinedOutput()
if err != nil {
return strings.Contains(string(output), "File is not encrypted")
}
return false
}
func repairPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "")
if err != nil {
return err
}
defer os.Remove(tempfile.Name())
cmd := exec.Command("qpdf", "--repair", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return err
}
_, err = tempfile.Write(output)
return err
}
func decryptPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "")
if err != nil {
return err
}
defer os.Remove(tempfile.Name())
cmd := exec.Command("qpdf", "--decrypt", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
}
processedData, err := os.ReadFile(tempfile.Name())
if err != nil {
return fmt.Errorf("failed to read processed file: %v", err)
}
err = os.WriteFile(inputPath, processedData, 0644)
if err != nil {
return fmt.Errorf("failed to overwrite input file: %v", err)
}
return nil
}
func extractTextFromPDF(path string) (string, []byte, error) {
f, r, err := pdf.Open(path)
if err != nil {
return "", []byte{}, fmt.Errorf("failed to open PDF: %w", err)
}
defer f.Close()
reader, err := r.GetPlainText()
if err != nil {
return "", []byte{}, fmt.Errorf("failed to get PDF text: %w", err)
}
content, err := io.ReadAll(reader)
if err != nil {
return "", []byte{}, fmt.Errorf("failed to read PDF content: %w", err)
}
rawData, err := os.ReadFile(path)
if err != nil {
return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err)
}
return string(content), rawData, nil
func handleErr(message string, err error) (*pb.OCRFileResponse, error) {
fmt.Println(err)
return &pb.OCRFileResponse{
Message: "OCR failed, " + message,
TextContent: "",
OcrContent: []byte{},
}, err
}
const maxMsgSize = 2147483648 // 2GB

120
server/pdfhelpers.go Normal file
View File

@@ -0,0 +1,120 @@
package main
import (
"fmt"
"os"
"os/exec"
"strings"
)
func isScannedPDF(path string) (bool, error) {
content, _, err := extractTextFromPDF(path)
if err != nil {
return false, err
}
return len(strings.TrimSpace(string(content))) != 0, nil
}
func runOCRMyPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
defer os.Remove(tempfile.Name())
if err != nil {
return err
}
cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output)
}
processedData, err := os.ReadFile(tempfile.Name())
if err != nil {
return fmt.Errorf("failed to read processed file: %v", err)
}
err = os.WriteFile(inputPath, processedData, 0644)
if err != nil {
return fmt.Errorf("failed to overwrite input file: %v", err)
}
return nil
}
func isEncrypted(pdfPath string) bool {
cmd := exec.Command("qpdf", "--check", pdfPath)
output, err := cmd.CombinedOutput()
if err != nil {
return strings.Contains(string(output), "File is not encrypted")
}
return false
}
func repairPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "")
if err != nil {
return err
}
defer os.Remove(tempfile.Name())
cmd := exec.Command("qpdf", "--repair", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return err
}
_, err = tempfile.Write(output)
return err
}
func decryptPDF(inputPath string) error {
tempfile, err := os.CreateTemp("", "")
if err != nil {
return err
}
defer os.Remove(tempfile.Name())
cmd := exec.Command("qpdf", "--decrypt", inputPath, tempfile.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
}
processedData, err := os.ReadFile(tempfile.Name())
if err != nil {
return fmt.Errorf("failed to read processed file: %v", err)
}
err = os.WriteFile(inputPath, processedData, 0644)
if err != nil {
return fmt.Errorf("failed to overwrite input file: %v", err)
}
return nil
}
func extractTextFromPDF(path string) (string, []byte, error) {
tmpOut, err := os.CreateTemp("", "pdftotext-*.txt")
if err != nil {
return "", nil, fmt.Errorf("failed to create temp file: %w", err)
}
tmpOut.Close()
defer os.Remove(tmpOut.Name())
cmd := exec.Command("pdftotext", path, tmpOut.Name())
output, err := cmd.CombinedOutput()
if err != nil {
return "", nil, fmt.Errorf("pdftotext failed: %v\nOutput: %s", err, output)
}
data, err := os.ReadFile(tmpOut.Name())
if err != nil {
return "", nil, fmt.Errorf("failed to read pdftotext output: %w", err)
}
rawData, err := os.ReadFile(path)
if err != nil {
return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err)
}
return string(data), rawData, nil
}