mirror of
https://github.com/JuLi0n21/thumbnailservice.git
synced 2026-04-20 00:10:07 +00:00
remove pdf dependency and replace with pdftotext...
This commit is contained in:
@@ -66,10 +66,7 @@ func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServic
|
|||||||
MaxHeight: 150,
|
MaxHeight: 150,
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
|
resp, err := client.GenerateThumbnail(context.Background(), req)
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
resp, err := client.GenerateThumbnail(ctx, req)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Error calling GenerateThumbnail: %v", err)
|
log.Fatalf("Error calling GenerateThumbnail: %v", err)
|
||||||
}
|
}
|
||||||
|
|||||||
182
server/main.go
182
server/main.go
@@ -5,7 +5,6 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"image"
|
"image"
|
||||||
"io"
|
|
||||||
"log"
|
"log"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
@@ -22,7 +21,6 @@ import (
|
|||||||
|
|
||||||
pb "github.com/JuLi0n21/thumbnail_service/proto"
|
pb "github.com/JuLi0n21/thumbnail_service/proto"
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
"github.com/ledongthuc/pdf"
|
|
||||||
"github.com/nfnt/resize"
|
"github.com/nfnt/resize"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
)
|
)
|
||||||
@@ -196,21 +194,14 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
|
|||||||
end := time.Since(time.Now())
|
end := time.Since(time.Now())
|
||||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
if req.FileType != pb.FileType_PDF {
|
if req.FileType != pb.FileType_PDF {
|
||||||
err := errors.New("unsupported Filetype " + req.FileType.String())
|
err := errors.New("unsupported Filetype " + req.FileType.String())
|
||||||
return &pb.OCRFileResponse{
|
return handleErr(err.Error(), err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
}
|
||||||
file, err := os.CreateTemp("ocr", "temp-file-*")
|
file, err := os.CreateTemp("ocr", "temp-file-*")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &pb.OCRFileResponse{
|
return handleErr("failed to create temp file", err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
}
|
||||||
filePath := file.Name()
|
filePath := file.Name()
|
||||||
defer func(file *os.File, filePath string) {
|
defer func(file *os.File, filePath string) {
|
||||||
@@ -224,57 +215,28 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
|
|||||||
|
|
||||||
_, err = file.Write(req.FileContent)
|
_, err = file.Write(req.FileContent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &pb.OCRFileResponse{
|
return handleErr("failed to write file to temp file", err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ok, err := isScannedPDF(filePath); err != nil {
|
if ok, err := isScannedPDF(filePath); err != nil {
|
||||||
return &pb.OCRFileResponse{
|
return handleErr("failed to check if file is scanned", err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
} else if !ok {
|
} else if !ok {
|
||||||
if isEncrypted(filePath) {
|
if isEncrypted(filePath) {
|
||||||
err := decryptPDF(filePath)
|
err := decryptPDF(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &pb.OCRFileResponse{
|
return handleErr("failed to decrypt pdf", err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
err = runOCRMyPDF(filePath)
|
err = runOCRMyPDF(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &pb.OCRFileResponse{
|
return handleErr("failed to ocr pdf", err)
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var text string
|
text, b, err := extractTextFromPDF(filePath)
|
||||||
var b []byte
|
|
||||||
text, b, err = extractTextFromPDF(filePath)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
return handleErr("failed to extract text", err)
|
||||||
if strings.Contains("malformed pdf", err.Error()) {
|
|
||||||
repairPDF(filePath)
|
|
||||||
text, b, err = extractTextFromPDF(filePath)
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return &pb.OCRFileResponse{
|
|
||||||
Message: "OCR failed, " + err.Error(),
|
|
||||||
TextContent: "",
|
|
||||||
OcrContent: []byte{},
|
|
||||||
}, err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &pb.OCRFileResponse{
|
return &pb.OCRFileResponse{
|
||||||
@@ -284,125 +246,13 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func isScannedPDF(path string) (bool, error) {
|
func handleErr(message string, err error) (*pb.OCRFileResponse, error) {
|
||||||
f, r, err := pdf.Open(path)
|
fmt.Println(err)
|
||||||
if err != nil {
|
return &pb.OCRFileResponse{
|
||||||
return false, fmt.Errorf("failed to open PDF: %w", err)
|
Message: "OCR failed, " + message,
|
||||||
}
|
TextContent: "",
|
||||||
defer f.Close()
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
reader, err := r.GetPlainText()
|
|
||||||
if err != nil {
|
|
||||||
return false, fmt.Errorf("failed to get PDF text: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
content, err := io.ReadAll(reader)
|
|
||||||
if err != nil {
|
|
||||||
return false, fmt.Errorf("failed to read PDF content: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return len(strings.TrimSpace(string(content))) != 0, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// runOCRMyPDF runs the external "ocrmypdf" tool on the PDF at inputPath and
// replaces that file's contents with the tool's result.
//
// ocrmypdf is invoked with --skip-text and writes to a temporary file, which
// is copied over inputPath and removed before the function returns.
//
// It returns an error when the temporary file cannot be created or closed,
// when ocrmypdf fails (the error then carries the tool's combined output), or
// when the result cannot be read back or written to inputPath.
func runOCRMyPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
	if err != nil {
		// tempfile is nil here, so it must not be touched before this check.
		return err
	}
	outputPath := tempfile.Name()
	defer os.Remove(outputPath)

	// ocrmypdf is handed the path only, so the handle is released right away
	// instead of being leaked on every call.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, outputPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(outputPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
|
||||||
|
|
||||||
// isEncrypted reports whether the PDF at pdfPath is encrypted.
//
// It runs "qpdf --is-encrypted", which signals the answer through its exit
// status: 0 when the file is encrypted and non-zero otherwise. Any failure to
// run qpdf at all (for example when it is not installed) is reported as false.
func isEncrypted(pdfPath string) bool {
	// Run returns nil only for exit status 0, i.e. "the file is encrypted".
	return exec.Command("qpdf", "--is-encrypted", pdfPath).Run() == nil
}
|
|
||||||
|
|
||||||
// repairPDF rewrites the PDF at inputPath through qpdf and replaces the file
// with the rewritten copy.
//
// qpdf attempts to recover damaged files on its own while rewriting them, so
// no dedicated repair flag is passed. The rewritten PDF goes to a temporary
// file that is copied over inputPath and removed before returning.
//
// It returns an error when the temporary file cannot be created or closed,
// when qpdf fails (the error then carries qpdf's combined output), or when the
// rewritten file cannot be read back or written to inputPath.
func repairPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	repairedPath := tempfile.Name()
	defer os.Remove(repairedPath)

	// qpdf is handed the path only; release the handle instead of leaking it.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	output, err := exec.Command("qpdf", inputPath, repairedPath).CombinedOutput()
	if err != nil {
		// Exit status 3 means qpdf produced its output but emitted warnings,
		// which is the expected outcome when it had to recover a damaged file.
		exitErr, ok := err.(*exec.ExitError)
		if !ok || exitErr.ExitCode() != 3 {
			return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
		}
	}

	repairedData, err := os.ReadFile(repairedPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, repairedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
|
||||||
|
|
||||||
// decryptPDF removes encryption from the PDF at inputPath in place.
//
// It runs "qpdf --decrypt" with a temporary file as the destination, then
// copies that file over inputPath. The temporary file is removed before the
// function returns.
//
// It returns an error when the temporary file cannot be created or closed,
// when qpdf fails (the error then carries qpdf's combined output), or when the
// decrypted file cannot be read back or written to inputPath.
func decryptPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	decryptedPath := tempfile.Name()
	defer os.Remove(decryptedPath)

	// qpdf is handed the path only; release the handle instead of leaking it.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	cmd := exec.Command("qpdf", "--decrypt", inputPath, decryptedPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(decryptedPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
|
||||||
|
|
||||||
func extractTextFromPDF(path string) (string, []byte, error) {
|
|
||||||
f, r, err := pdf.Open(path)
|
|
||||||
if err != nil {
|
|
||||||
return "", []byte{}, fmt.Errorf("failed to open PDF: %w", err)
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
reader, err := r.GetPlainText()
|
|
||||||
if err != nil {
|
|
||||||
return "", []byte{}, fmt.Errorf("failed to get PDF text: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
content, err := io.ReadAll(reader)
|
|
||||||
if err != nil {
|
|
||||||
return "", []byte{}, fmt.Errorf("failed to read PDF content: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
rawData, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
|
||||||
return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return string(content), rawData, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const maxMsgSize = 2147483648 // 2GB
|
const maxMsgSize = 2147483648 // 2GB
|
||||||
|
|||||||
120
server/pdfhelpers.go
Normal file
120
server/pdfhelpers.go
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func isScannedPDF(path string) (bool, error) {
|
||||||
|
content, _, err := extractTextFromPDF(path)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return len(strings.TrimSpace(string(content))) != 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// runOCRMyPDF runs the external "ocrmypdf" tool on the PDF at inputPath and
// replaces that file's contents with the tool's result.
//
// ocrmypdf is invoked with --skip-text and writes to a temporary file, which
// is copied over inputPath and removed before the function returns.
//
// It returns an error when the temporary file cannot be created or closed,
// when ocrmypdf fails (the error then carries the tool's combined output), or
// when the result cannot be read back or written to inputPath.
func runOCRMyPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
	if err != nil {
		// tempfile is nil here, so it must not be touched before this check.
		return err
	}
	outputPath := tempfile.Name()
	defer os.Remove(outputPath)

	// ocrmypdf is handed the path only, so the handle is released right away
	// instead of being leaked on every call.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, outputPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("ocrmypdf failed: %v\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(outputPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
||||||
|
|
||||||
|
// isEncrypted reports whether the PDF at pdfPath is encrypted.
//
// It runs "qpdf --is-encrypted", which signals the answer through its exit
// status: 0 when the file is encrypted and non-zero otherwise. Any failure to
// run qpdf at all (for example when it is not installed) is reported as false.
func isEncrypted(pdfPath string) bool {
	// Run returns nil only for exit status 0, i.e. "the file is encrypted".
	return exec.Command("qpdf", "--is-encrypted", pdfPath).Run() == nil
}
|
||||||
|
|
||||||
|
// repairPDF rewrites the PDF at inputPath through qpdf and replaces the file
// with the rewritten copy.
//
// qpdf attempts to recover damaged files on its own while rewriting them, so
// no dedicated repair flag is passed. The rewritten PDF goes to a temporary
// file that is copied over inputPath and removed before returning.
//
// It returns an error when the temporary file cannot be created or closed,
// when qpdf fails (the error then carries qpdf's combined output), or when the
// rewritten file cannot be read back or written to inputPath.
func repairPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	repairedPath := tempfile.Name()
	defer os.Remove(repairedPath)

	// qpdf is handed the path only; release the handle instead of leaking it.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	output, err := exec.Command("qpdf", inputPath, repairedPath).CombinedOutput()
	if err != nil {
		// Exit status 3 means qpdf produced its output but emitted warnings,
		// which is the expected outcome when it had to recover a damaged file.
		exitErr, ok := err.(*exec.ExitError)
		if !ok || exitErr.ExitCode() != 3 {
			return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
		}
	}

	repairedData, err := os.ReadFile(repairedPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, repairedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
||||||
|
|
||||||
|
// decryptPDF removes encryption from the PDF at inputPath in place.
//
// It runs "qpdf --decrypt" with a temporary file as the destination, then
// copies that file over inputPath. The temporary file is removed before the
// function returns.
//
// It returns an error when the temporary file cannot be created or closed,
// when qpdf fails (the error then carries qpdf's combined output), or when the
// decrypted file cannot be read back or written to inputPath.
func decryptPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	decryptedPath := tempfile.Name()
	defer os.Remove(decryptedPath)

	// qpdf is handed the path only; release the handle instead of leaking it.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %v", err)
	}

	cmd := exec.Command("qpdf", "--decrypt", inputPath, decryptedPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("qpdf failed: %v\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(decryptedPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %v", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %v", err)
	}
	return nil
}
|
||||||
|
|
||||||
|
// extractTextFromPDF converts the PDF at path to plain text with the external
// "pdftotext" tool and also loads the PDF's raw bytes.
//
// pdftotext writes into a temporary .txt file that is deleted when the
// function returns. On success the results are the extracted text, the
// unmodified contents of path, and a nil error. On any failure the text is
// empty, the byte slice is nil, and the error names the step that failed; for
// a pdftotext failure it also carries the tool's combined output.
func extractTextFromPDF(path string) (string, []byte, error) {
	scratch, createErr := os.CreateTemp("", "pdftotext-*.txt")
	if createErr != nil {
		return "", nil, fmt.Errorf("failed to create temp file: %w", createErr)
	}
	// pdftotext is handed the path only, so the handle is released right away.
	scratch.Close()
	textPath := scratch.Name()
	defer os.Remove(textPath)

	if combined, runErr := exec.Command("pdftotext", path, textPath).CombinedOutput(); runErr != nil {
		return "", nil, fmt.Errorf("pdftotext failed: %v\nOutput: %s", runErr, combined)
	}

	extracted, readErr := os.ReadFile(textPath)
	if readErr != nil {
		return "", nil, fmt.Errorf("failed to read pdftotext output: %w", readErr)
	}

	pdfBytes, rawErr := os.ReadFile(path)
	if rawErr != nil {
		return "", nil, fmt.Errorf("failed to read raw PDF file: %w", rawErr)
	}

	return string(extracted), pdfBytes, nil
}
|
||||||
Reference in New Issue
Block a user