mirror of
https://github.com/JuLi0n21/thumbnailservice.git
synced 2026-04-19 16:00:07 +00:00
fix ocr
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
client/testdata
|
||||
client/thumbnail
|
||||
client/thumbnail
|
||||
client/ocr
|
||||
@@ -29,21 +29,29 @@ func main() {
|
||||
}
|
||||
client := pb.NewThumbnailServiceClient(conn)
|
||||
filePath := []thingy{
|
||||
{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
||||
//{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
||||
{pb.FileType_PDF, "testdata/pdf-sample.pdf"},
|
||||
{pb.FileType_VIDEO, "testdata/video-sample.webm"}}
|
||||
|
||||
a := sync.WaitGroup{}
|
||||
|
||||
for _, f := range filePath {
|
||||
a.Add(1)
|
||||
go func() {
|
||||
createPreview(f.Path, f.Type, client)
|
||||
a.Done()
|
||||
}()
|
||||
{pb.FileType_PDF, "testdata/blitzer.pdf"},
|
||||
//{pb.FileType_VIDEO, "testdata/video-sample.webm"}
|
||||
}
|
||||
|
||||
a.Wait()
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
for _, f := range filePath {
|
||||
wg.Add(2)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
createPreview(f.Path, f.Type, client)
|
||||
}()
|
||||
|
||||
go func(f thingy) {
|
||||
defer wg.Done()
|
||||
createOCR(f.Path, f.Type, client)
|
||||
}(f)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
}
|
||||
|
||||
func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServiceClient) {
|
||||
@@ -78,6 +86,40 @@ func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServic
|
||||
}
|
||||
}
|
||||
|
||||
func createOCR(filePath string, ftype pb.FileType, client pb.ThumbnailServiceClient) {
|
||||
fileContent, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
log.Printf("Error reading file: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
req := &pb.OCRFileRequest{
|
||||
FileContent: fileContent,
|
||||
FileType: ftype,
|
||||
CleanUp: true,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60000*time.Second)
|
||||
defer cancel()
|
||||
|
||||
resp, err := client.OcrFile(ctx, req)
|
||||
if err != nil {
|
||||
log.Printf("Error calling OcrDocument: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("[OCR] %s: %s\n %s", filePath, resp.Message, resp.TextContent)
|
||||
|
||||
if len(resp.OcrContent) > 0 {
|
||||
err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf")
|
||||
if err != nil {
|
||||
log.Printf("Error saving OCR text to file: %v", err)
|
||||
} else {
|
||||
fmt.Println("OCR text saved successfully.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Function to save the thumbnail content to a file in the 'thumbnail/' directory
|
||||
func saveThumbnailToFile(thumbnailContent []byte, filePath string) error {
|
||||
// Ensure the "thumbnail" directory exists
|
||||
@@ -97,3 +139,21 @@ func saveThumbnailToFile(thumbnailContent []byte, filePath string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// saveToFile writes data to <folder>/<name><ext>, where <name> is the base
// name of originalPath with its extension removed. The folder is created
// (including parents) when it does not exist yet.
//
// Returned errors wrap the underlying filesystem error, so callers can
// inspect them with errors.Is / errors.As.
func saveToFile(data []byte, originalPath, folder, ext string) error {
	if err := os.MkdirAll(folder, os.ModePerm); err != nil {
		return fmt.Errorf("failed to create directory: %w", err)
	}

	baseName := filepath.Base(originalPath)
	fileName := strings.TrimSuffix(baseName, filepath.Ext(baseName))
	fullPath := filepath.Join(folder, fileName+ext)

	if err := os.WriteFile(fullPath, data, 0644); err != nil {
		return fmt.Errorf("failed to save file: %w", err)
	}
	return nil
}
|
||||
|
||||
@@ -345,9 +345,10 @@ const file_thumbnail_proto_rawDesc = "" +
|
||||
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
||||
"\x05IMAGE\x10\x01\x12\t\n" +
|
||||
"\x05VIDEO\x10\x02\x12\a\n" +
|
||||
"\x03PDF\x10\x032r\n" +
|
||||
"\x03PDF\x10\x032\xc4\x01\n" +
|
||||
"\x10ThumbnailService\x12^\n" +
|
||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponseB\tZ\a./protob\x06proto3"
|
||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponse\x12P\n" +
|
||||
"\aOcrFile\x12!.thumbnail_service.OCRFileRequest\x1a\".thumbnail_service.OCRFileResponseB\tZ\a./protob\x06proto3"
|
||||
|
||||
var (
|
||||
file_thumbnail_proto_rawDescOnce sync.Once
|
||||
@@ -374,9 +375,11 @@ var file_thumbnail_proto_depIdxs = []int32{
|
||||
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
||||
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
||||
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
||||
2, // 3: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||
3, // [3:4] is the sub-list for method output_type
|
||||
2, // [2:3] is the sub-list for method input_type
|
||||
3, // 3: thumbnail_service.ThumbnailService.OcrFile:input_type -> thumbnail_service.OCRFileRequest
|
||||
2, // 4: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||
4, // 5: thumbnail_service.ThumbnailService.OcrFile:output_type -> thumbnail_service.OCRFileResponse
|
||||
4, // [4:6] is the sub-list for method output_type
|
||||
2, // [2:4] is the sub-list for method input_type
|
||||
2, // [2:2] is the sub-list for extension type_name
|
||||
2, // [2:2] is the sub-list for extension extendee
|
||||
0, // [0:2] is the sub-list for field type_name
|
||||
|
||||
@@ -20,6 +20,7 @@ const _ = grpc.SupportPackageIsVersion9
|
||||
|
||||
const (
|
||||
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
||||
ThumbnailService_OcrFile_FullMethodName = "/thumbnail_service.ThumbnailService/OcrFile"
|
||||
)
|
||||
|
||||
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
||||
@@ -29,6 +30,7 @@ const (
|
||||
// Service definition
|
||||
type ThumbnailServiceClient interface {
|
||||
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
||||
OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error)
|
||||
}
|
||||
|
||||
type thumbnailServiceClient struct {
|
||||
@@ -49,6 +51,16 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *thumbnailServiceClient) OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error) {
|
||||
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
|
||||
out := new(OCRFileResponse)
|
||||
err := c.cc.Invoke(ctx, ThumbnailService_OcrFile_FullMethodName, in, out, cOpts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
||||
// All implementations must embed UnimplementedThumbnailServiceServer
|
||||
// for forward compatibility.
|
||||
@@ -56,6 +68,7 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
||||
// Service definition
|
||||
type ThumbnailServiceServer interface {
|
||||
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
||||
OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error)
|
||||
mustEmbedUnimplementedThumbnailServiceServer()
|
||||
}
|
||||
|
||||
@@ -69,6 +82,9 @@ type UnimplementedThumbnailServiceServer struct{}
|
||||
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
||||
}
|
||||
func (UnimplementedThumbnailServiceServer) OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method OcrFile not implemented")
|
||||
}
|
||||
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
||||
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
||||
|
||||
@@ -108,6 +124,24 @@ func _ThumbnailService_GenerateThumbnail_Handler(srv interface{}, ctx context.Co
|
||||
return interceptor(ctx, in, info, handler)
|
||||
}
|
||||
|
||||
func _ThumbnailService_OcrFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||
in := new(OCRFileRequest)
|
||||
if err := dec(in); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if interceptor == nil {
|
||||
return srv.(ThumbnailServiceServer).OcrFile(ctx, in)
|
||||
}
|
||||
info := &grpc.UnaryServerInfo{
|
||||
Server: srv,
|
||||
FullMethod: ThumbnailService_OcrFile_FullMethodName,
|
||||
}
|
||||
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
|
||||
return srv.(ThumbnailServiceServer).OcrFile(ctx, req.(*OCRFileRequest))
|
||||
}
|
||||
return interceptor(ctx, in, info, handler)
|
||||
}
|
||||
|
||||
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
||||
// It's only intended for direct use with grpc.RegisterService,
|
||||
// and not to be introspected or modified (even as a copy)
|
||||
@@ -119,6 +153,10 @@ var ThumbnailService_ServiceDesc = grpc.ServiceDesc{
|
||||
MethodName: "GenerateThumbnail",
|
||||
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
||||
},
|
||||
{
|
||||
MethodName: "OcrFile",
|
||||
Handler: _ThumbnailService_OcrFile_Handler,
|
||||
},
|
||||
},
|
||||
Streams: []grpc.StreamDesc{},
|
||||
Metadata: "thumbnail.proto",
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
FROM golang:1.24.1
|
||||
FROM golang:1.24.2
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
imagemagick \
|
||||
ffmpeg \
|
||||
poppler-utils && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
poppler-utils \
|
||||
ocrmypdf \
|
||||
qpdf \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY . .
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ go 1.24.1
|
||||
|
||||
require (
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
|
||||
google.golang.org/grpc v1.71.0
|
||||
google.golang.org/protobuf v1.36.6
|
||||
|
||||
@@ -8,6 +8,8 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
|
||||
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
|
||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
||||
|
||||
216
server/main.go
216
server/main.go
@@ -2,8 +2,10 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"image"
|
||||
"io"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
@@ -20,6 +22,7 @@ import (
|
||||
|
||||
pb "github.com/JuLi0n21/thumbnail_service/proto"
|
||||
"github.com/google/uuid"
|
||||
"github.com/ledongthuc/pdf"
|
||||
"github.com/nfnt/resize"
|
||||
"google.golang.org/grpc"
|
||||
)
|
||||
@@ -170,6 +173,8 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
|
||||
if _, err := os.Stat(outputPath); err == nil {
|
||||
os.Remove(outputPath)
|
||||
}
|
||||
end := time.Since(time.Now())
|
||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
||||
}()
|
||||
|
||||
thumbnailContent, err := os.ReadFile(outputPath)
|
||||
@@ -177,58 +182,229 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
|
||||
return nil, fmt.Errorf("failed to read generated thumbnail: %v", err)
|
||||
}
|
||||
|
||||
end := time.Since(time.Now())
|
||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
||||
return &pb.ThumbnailResponse{
|
||||
Message: "Thumbnail generated successfully",
|
||||
ThumbnailContent: thumbnailContent,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *server) OCRFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFileResponse, error) {
|
||||
func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFileResponse, error) {
|
||||
start := time.Now()
|
||||
fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType)
|
||||
|
||||
//save to disk
|
||||
defer func() {
|
||||
end := time.Since(time.Now())
|
||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
||||
}()
|
||||
if req.FileType != pb.FileType_PDF {
|
||||
err := errors.New("unsupported Filetype " + req.FileType.String())
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
file, err := os.CreateTemp("ocr", "temp-file-*")
|
||||
if err != nil {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
filePath := file.Name()
|
||||
defer func(file *os.File, filePath string) {
|
||||
file.Close()
|
||||
|
||||
//do preprocessing
|
||||
err = os.Remove(file.Name())
|
||||
if err != nil {
|
||||
fmt.Println(err.Error())
|
||||
}
|
||||
}(file, filePath)
|
||||
|
||||
//extract information...
|
||||
_, err = file.Write(req.FileContent)
|
||||
if err != nil {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
|
||||
if ok, err := isScannedPDF(filePath); err != nil {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
} else if !ok {
|
||||
if isEncrypted(filePath) {
|
||||
err := decryptPDF(filePath)
|
||||
if err != nil {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
}
|
||||
|
||||
err = runOCRMyPDF(filePath)
|
||||
if err != nil {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
}
|
||||
|
||||
var text string
|
||||
var b []byte
|
||||
text, b, err = extractTextFromPDF(filePath)
|
||||
if err != nil {
|
||||
|
||||
if strings.Contains("malformed pdf", err.Error()) {
|
||||
repairPDF(filePath)
|
||||
text, b, err = extractTextFromPDF(filePath)
|
||||
|
||||
} else {
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCR failed, " + err.Error(),
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
}, err
|
||||
}
|
||||
}
|
||||
|
||||
//return stuff
|
||||
return &pb.OCRFileResponse{
|
||||
Message: "OCRed successfully",
|
||||
TextContent: "",
|
||||
OcrContent: []byte{},
|
||||
Message: "OCR success",
|
||||
TextContent: text,
|
||||
OcrContent: b,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func isScannedPDF(filePath string) bool {
|
||||
cmd := exec.Command("pdftotext", filePath, "-")
|
||||
out, err := cmd.Output()
|
||||
func isScannedPDF(path string) (bool, error) {
|
||||
f, r, err := pdf.Open(path)
|
||||
if err != nil {
|
||||
log.Printf("pdftotext error for %s: %v", filePath, err)
|
||||
return false
|
||||
return false, fmt.Errorf("failed to open PDF: %w", err)
|
||||
}
|
||||
return len(strings.TrimSpace(string(out))) == 0
|
||||
defer f.Close()
|
||||
|
||||
reader, err := r.GetPlainText()
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to get PDF text: %w", err)
|
||||
}
|
||||
|
||||
content, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to read PDF content: %w", err)
|
||||
}
|
||||
|
||||
return len(strings.TrimSpace(string(content))) != 0, nil
|
||||
}
|
||||
|
||||
func runOCRMyPDF(inputPath, outputPath string) error {
|
||||
cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, outputPath)
|
||||
// runOCRMyPDF runs `ocrmypdf --skip-text` on inputPath, writing to a temporary
// file, and then overwrites inputPath with the processed result.
//
// It returns an error when the temporary file cannot be created, when
// ocrmypdf exits unsuccessfully (its combined output is included in the
// message), or when the result cannot be read back or written to inputPath.
func runOCRMyPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
	if err != nil {
		// Check before touching tempfile: it is nil on failure.
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; ocrmypdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("ocrmypdf failed: %w\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %w", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}
	return nil
}
|
||||
|
||||
func decryptPDF(inputPath, outputPath string) error {
|
||||
cmd := exec.Command("qpdf", "--decrypt", inputPath, outputPath)
|
||||
// isEncrypted reports whether qpdf considers the PDF at pdfPath encrypted.
//
// It runs `qpdf --is-encrypted`, which exits with status 0 for an encrypted
// file and a non-zero status otherwise. Any failure to run qpdf (binary
// missing, unreadable file, ...) is therefore reported as "not encrypted".
func isEncrypted(pdfPath string) bool {
	return exec.Command("qpdf", "--is-encrypted", pdfPath).Run() == nil
}
|
||||
|
||||
// repairPDF rewrites the PDF at inputPath through qpdf and replaces the input
// with the rewritten file.
//
// qpdf is invoked as `qpdf <input> <output>`. Exit status 3 (the file was
// written but warnings were issued) is treated as success; any other failure
// is returned together with qpdf's combined output, and inputPath is left
// untouched.
func repairPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; qpdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	output, err := exec.Command("qpdf", inputPath, tempPath).CombinedOutput()
	if err != nil {
		var exitErr *exec.ExitError
		if !errors.As(err, &exitErr) || exitErr.ExitCode() != 3 {
			return fmt.Errorf("qpdf repair failed: %w\nOutput: %s", err, output)
		}
	}

	repaired, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read repaired file: %w", err)
	}

	if err := os.WriteFile(inputPath, repaired, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}
	return nil
}
|
||||
|
||||
// decryptPDF runs `qpdf --decrypt` on inputPath, writing to a temporary file,
// and then overwrites inputPath with the decrypted result.
//
// It returns an error when the temporary file cannot be created, when qpdf
// exits unsuccessfully (its combined output is included in the message), or
// when the result cannot be read back or written to inputPath.
func decryptPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; qpdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	cmd := exec.Command("qpdf", "--decrypt", inputPath, tempPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("qpdf failed: %w\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %w", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}
	return nil
}
|
||||
|
||||
func extractTextFromPDF(path string) (string, []byte, error) {
|
||||
f, r, err := pdf.Open(path)
|
||||
if err != nil {
|
||||
return "", []byte{}, fmt.Errorf("failed to open PDF: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
reader, err := r.GetPlainText()
|
||||
if err != nil {
|
||||
return "", []byte{}, fmt.Errorf("failed to get PDF text: %w", err)
|
||||
}
|
||||
|
||||
content, err := io.ReadAll(reader)
|
||||
if err != nil {
|
||||
return "", []byte{}, fmt.Errorf("failed to read PDF content: %w", err)
|
||||
}
|
||||
|
||||
rawData, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err)
|
||||
}
|
||||
|
||||
return string(content), rawData, nil
|
||||
}
|
||||
|
||||
const maxMsgSize = 2147483648 // 2GB
|
||||
|
||||
func main() {
|
||||
|
||||
@@ -345,9 +345,10 @@ const file_thumbnail_proto_rawDesc = "" +
|
||||
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
||||
"\x05IMAGE\x10\x01\x12\t\n" +
|
||||
"\x05VIDEO\x10\x02\x12\a\n" +
|
||||
"\x03PDF\x10\x032r\n" +
|
||||
"\x03PDF\x10\x032\xc4\x01\n" +
|
||||
"\x10ThumbnailService\x12^\n" +
|
||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponseB\tZ\a./protob\x06proto3"
|
||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponse\x12P\n" +
|
||||
"\aOcrFile\x12!.thumbnail_service.OCRFileRequest\x1a\".thumbnail_service.OCRFileResponseB\tZ\a./protob\x06proto3"
|
||||
|
||||
var (
|
||||
file_thumbnail_proto_rawDescOnce sync.Once
|
||||
@@ -374,9 +375,11 @@ var file_thumbnail_proto_depIdxs = []int32{
|
||||
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
||||
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
||||
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
||||
2, // 3: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||
3, // [3:4] is the sub-list for method output_type
|
||||
2, // [2:3] is the sub-list for method input_type
|
||||
3, // 3: thumbnail_service.ThumbnailService.OcrFile:input_type -> thumbnail_service.OCRFileRequest
|
||||
2, // 4: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||
4, // 5: thumbnail_service.ThumbnailService.OcrFile:output_type -> thumbnail_service.OCRFileResponse
|
||||
4, // [4:6] is the sub-list for method output_type
|
||||
2, // [2:4] is the sub-list for method input_type
|
||||
2, // [2:2] is the sub-list for extension type_name
|
||||
2, // [2:2] is the sub-list for extension extendee
|
||||
0, // [0:2] is the sub-list for field type_name
|
||||
|
||||
@@ -20,6 +20,7 @@ const _ = grpc.SupportPackageIsVersion9
|
||||
|
||||
const (
|
||||
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
||||
ThumbnailService_OcrFile_FullMethodName = "/thumbnail_service.ThumbnailService/OcrFile"
|
||||
)
|
||||
|
||||
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
||||
@@ -29,6 +30,7 @@ const (
|
||||
// Service definition
|
||||
type ThumbnailServiceClient interface {
|
||||
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
||||
OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error)
|
||||
}
|
||||
|
||||
type thumbnailServiceClient struct {
|
||||
@@ -49,6 +51,16 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (c *thumbnailServiceClient) OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error) {
|
||||
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
|
||||
out := new(OCRFileResponse)
|
||||
err := c.cc.Invoke(ctx, ThumbnailService_OcrFile_FullMethodName, in, out, cOpts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
||||
// All implementations must embed UnimplementedThumbnailServiceServer
|
||||
// for forward compatibility.
|
||||
@@ -56,6 +68,7 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
||||
// Service definition
|
||||
type ThumbnailServiceServer interface {
|
||||
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
||||
OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error)
|
||||
mustEmbedUnimplementedThumbnailServiceServer()
|
||||
}
|
||||
|
||||
@@ -69,6 +82,9 @@ type UnimplementedThumbnailServiceServer struct{}
|
||||
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
||||
}
|
||||
func (UnimplementedThumbnailServiceServer) OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method OcrFile not implemented")
|
||||
}
|
||||
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
||||
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
||||
|
||||
@@ -108,6 +124,24 @@ func _ThumbnailService_GenerateThumbnail_Handler(srv interface{}, ctx context.Co
|
||||
return interceptor(ctx, in, info, handler)
|
||||
}
|
||||
|
||||
func _ThumbnailService_OcrFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||
in := new(OCRFileRequest)
|
||||
if err := dec(in); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if interceptor == nil {
|
||||
return srv.(ThumbnailServiceServer).OcrFile(ctx, in)
|
||||
}
|
||||
info := &grpc.UnaryServerInfo{
|
||||
Server: srv,
|
||||
FullMethod: ThumbnailService_OcrFile_FullMethodName,
|
||||
}
|
||||
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
|
||||
return srv.(ThumbnailServiceServer).OcrFile(ctx, req.(*OCRFileRequest))
|
||||
}
|
||||
return interceptor(ctx, in, info, handler)
|
||||
}
|
||||
|
||||
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
||||
// It's only intended for direct use with grpc.RegisterService,
|
||||
// and not to be introspected or modified (even as a copy)
|
||||
@@ -119,6 +153,10 @@ var ThumbnailService_ServiceDesc = grpc.ServiceDesc{
|
||||
MethodName: "GenerateThumbnail",
|
||||
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
||||
},
|
||||
{
|
||||
MethodName: "OcrFile",
|
||||
Handler: _ThumbnailService_OcrFile_Handler,
|
||||
},
|
||||
},
|
||||
Streams: []grpc.StreamDesc{},
|
||||
Metadata: "thumbnail.proto",
|
||||
|
||||
@@ -15,6 +15,7 @@ enum FileType {
|
||||
// Service definition
|
||||
service ThumbnailService {
|
||||
rpc GenerateThumbnail(ThumbnailRequest) returns (ThumbnailResponse);
|
||||
rpc OcrFile(OCRFileRequest) returns (OCRFileResponse);
|
||||
}
|
||||
|
||||
// Request message for generating thumbnails
|
||||
|
||||
Reference in New Issue
Block a user