From ba8a9023eaaeee01f4914b5a3e8a40985d129e3f Mon Sep 17 00:00:00 2001 From: JuLi0n21 Date: Tue, 22 Apr 2025 19:56:59 +0200 Subject: [PATCH] remove pdf dependency and replace with pdftotext... --- client/main.go | 5 +- server/main.go | 182 ++++--------------------------------------- server/pdfhelpers.go | 120 ++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 170 deletions(-) create mode 100644 server/pdfhelpers.go diff --git a/client/main.go b/client/main.go index 898abd2..6fdfd25 100644 --- a/client/main.go +++ b/client/main.go @@ -66,10 +66,7 @@ func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServic MaxHeight: 150, } - ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) - defer cancel() - - resp, err := client.GenerateThumbnail(ctx, req) + resp, err := client.GenerateThumbnail(context.Background(), req) if err != nil { log.Fatalf("Error calling GenerateThumbnail: %v", err) } diff --git a/server/main.go b/server/main.go index 2ddabc9..f0f196e 100644 --- a/server/main.go +++ b/server/main.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "image" - "io" "log" "net" "os" @@ -22,7 +21,6 @@ import ( pb "github.com/JuLi0n21/thumbnail_service/proto" "github.com/google/uuid" - "github.com/ledongthuc/pdf" "github.com/nfnt/resize" "google.golang.org/grpc" ) @@ -196,21 +194,14 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi end := time.Since(time.Now()) fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType) }() + if req.FileType != pb.FileType_PDF { err := errors.New("unsupported Filetype " + req.FileType.String()) - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr(err.Error(), err) } file, err := os.CreateTemp("ocr", "temp-file-*") if err != nil { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr("failed to create temp file", err) } filePath := file.Name() defer func(file *os.File, filePath string) { @@ -224,57 +215,28 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi _, err = file.Write(req.FileContent) if err != nil { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr("failed to write file to temp file", err) } if ok, err := isScannedPDF(filePath); err != nil { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr("failed to check if file is scanned", err) } else if !ok { if isEncrypted(filePath) { err := decryptPDF(filePath) if err != nil { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr("failed to decrypt pdf", err) } } err = runOCRMyPDF(filePath) if err != nil { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err + return handleErr("failed to ocr pdf", err) } } - var text string - var b []byte - text, b, err = extractTextFromPDF(filePath) + text, b, err := extractTextFromPDF(filePath) if err != nil { - - if strings.Contains("malformed pdf", err.Error()) { - repairPDF(filePath) - text, b, err = extractTextFromPDF(filePath) - - } else { - return &pb.OCRFileResponse{ - Message: "OCR failed, " + err.Error(), - TextContent: "", - OcrContent: []byte{}, - }, err - } + return handleErr("failed to extract text", err) } return &pb.OCRFileResponse{ @@ -284,125 +246,13 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi }, nil } -func isScannedPDF(path string) (bool, error) { - f, r, err := pdf.Open(path) - if err != nil { - return false, fmt.Errorf("failed to open PDF: %w", err) - } - defer f.Close() - - reader, err := r.GetPlainText() - if err != nil { - return false, fmt.Errorf("failed to get PDF text: %w", err) - } - - content, err := io.ReadAll(reader) - if err != nil { - return false, fmt.Errorf("failed to read PDF content: %w", err) - } - - return len(strings.TrimSpace(string(content))) != 0, nil -} - -func runOCRMyPDF(inputPath string) error { - tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf") - defer os.Remove(tempfile.Name()) - if err != nil { - return err - } - cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempfile.Name()) - output, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output) - } - - processedData, err := os.ReadFile(tempfile.Name()) - if err != nil { - return fmt.Errorf("failed to read processed file: %v", err) - } - - err = os.WriteFile(inputPath, processedData, 0644) - if err != nil { - return fmt.Errorf("failed to overwrite input file: %v", err) - } - return nil -} - -func isEncrypted(pdfPath string) bool { - cmd := exec.Command("qpdf", "--check", pdfPath) - output, err := cmd.CombinedOutput() - if err != nil { - return strings.Contains(string(output), "File is not encrypted") - } - return false -} - -func repairPDF(inputPath string) error { - tempfile, err := os.CreateTemp("", "") - if err != nil { - return err - } - defer os.Remove(tempfile.Name()) - - cmd := exec.Command("qpdf", "--repair", inputPath, tempfile.Name()) - output, err := cmd.CombinedOutput() - if err != nil { - return err - } - - _, err = tempfile.Write(output) - - return err -} - -func decryptPDF(inputPath string) error { - tempfile, err := os.CreateTemp("", "") - if err != nil { - return err - } - defer os.Remove(tempfile.Name()) - - cmd := exec.Command("qpdf", "--decrypt", inputPath, tempfile.Name()) - output, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output) - } - - processedData, err := os.ReadFile(tempfile.Name()) - if err != nil { - return fmt.Errorf("failed to read processed file: %v", err) - } - - err = os.WriteFile(inputPath, processedData, 0644) - if err != nil { - return fmt.Errorf("failed to overwrite input file: %v", err) - } - return nil -} - -func extractTextFromPDF(path string) (string, []byte, error) { - f, r, err := pdf.Open(path) - if err != nil { - return "", []byte{}, fmt.Errorf("failed to open PDF: %w", err) - } - defer f.Close() - - reader, err := r.GetPlainText() - if err != nil { - return "", []byte{}, fmt.Errorf("failed to get PDF text: %w", err) - } - - content, err := io.ReadAll(reader) - if err != nil { - return "", []byte{}, fmt.Errorf("failed to read PDF content: %w", err) - } - - rawData, err := os.ReadFile(path) - if err != nil { - return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err) - } - - return string(content), rawData, nil +func handleErr(message string, err error) (*pb.OCRFileResponse, error) { + fmt.Println(err) + return &pb.OCRFileResponse{ + Message: "OCR failed, " + message, + TextContent: "", + OcrContent: []byte{}, + }, err } const maxMsgSize = 2147483648 // 2GB diff --git a/server/pdfhelpers.go b/server/pdfhelpers.go new file mode 100644 index 0000000..c892764 --- /dev/null +++ b/server/pdfhelpers.go @@ -0,0 +1,120 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "strings" +) + +func isScannedPDF(path string) (bool, error) { + content, _, err := extractTextFromPDF(path) + if err != nil { + return false, err + } + return len(strings.TrimSpace(string(content))) != 0, nil +} + +func runOCRMyPDF(inputPath string) error { + tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf") + defer os.Remove(tempfile.Name()) + if err != nil { + return err + } + cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempfile.Name()) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output) + } + + processedData, err := os.ReadFile(tempfile.Name()) + if err != nil { + return fmt.Errorf("failed to read processed file: %v", err) + } + + err = os.WriteFile(inputPath, processedData, 0644) + if err != nil { + return fmt.Errorf("failed to overwrite input file: %v", err) + } + return nil +} + +func isEncrypted(pdfPath string) bool { + cmd := exec.Command("qpdf", "--check", pdfPath) + output, err := cmd.CombinedOutput() + if err != nil { + return strings.Contains(string(output), "File is not encrypted") + } + return false +} + +func repairPDF(inputPath string) error { + tempfile, err := os.CreateTemp("", "") + if err != nil { + return err + } + defer os.Remove(tempfile.Name()) + + cmd := exec.Command("qpdf", "--repair", inputPath, tempfile.Name()) + output, err := cmd.CombinedOutput() + if err != nil { + return err + } + + _, err = tempfile.Write(output) + + return err +} + +func decryptPDF(inputPath string) error { + tempfile, err := os.CreateTemp("", "") + if err != nil { + return err + } + defer os.Remove(tempfile.Name()) + + cmd := exec.Command("qpdf", "--decrypt", inputPath, tempfile.Name()) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output) + } + + processedData, err := os.ReadFile(tempfile.Name()) + if err != nil { + return fmt.Errorf("failed to read processed file: %v", err) + } + + err = os.WriteFile(inputPath, processedData, 0644) + if err != nil { + return fmt.Errorf("failed to overwrite input file: %v", err) + } + return nil +} + +func extractTextFromPDF(path string) (string, []byte, error) { + + tmpOut, err := os.CreateTemp("", "pdftotext-*.txt") + if err != nil { + return "", nil, fmt.Errorf("failed to create temp file: %w", err) + } + tmpOut.Close() + defer os.Remove(tmpOut.Name()) + + cmd := exec.Command("pdftotext", path, tmpOut.Name()) + output, err := cmd.CombinedOutput() + if err != nil { + return "", nil, fmt.Errorf("pdftotext failed: %v\nOutput: %s", err, output) + } + + data, err := os.ReadFile(tmpOut.Name()) + if err != nil { + return "", nil, fmt.Errorf("failed to read pdftotext output: %w", err) + } + + rawData, err := os.ReadFile(path) + if err != nil { + return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err) + } + + return string(data), rawData, nil +}