mirror of
https://github.com/JuLi0n21/thumbnailservice.git
synced 2026-04-19 16:00:07 +00:00
fix ocr
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,2 +1,3 @@
|
|||||||
client/testdata
|
client/testdata
|
||||||
client/thumbnail
|
client/thumbnail
|
||||||
|
client/ocr
|
||||||
@@ -29,21 +29,29 @@ func main() {
|
|||||||
}
|
}
|
||||||
client := pb.NewThumbnailServiceClient(conn)
|
client := pb.NewThumbnailServiceClient(conn)
|
||||||
filePath := []thingy{
|
filePath := []thingy{
|
||||||
{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
//{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
||||||
{pb.FileType_PDF, "testdata/pdf-sample.pdf"},
|
{pb.FileType_PDF, "testdata/pdf-sample.pdf"},
|
||||||
{pb.FileType_VIDEO, "testdata/video-sample.webm"}}
|
{pb.FileType_PDF, "testdata/blitzer.pdf"},
|
||||||
|
//{pb.FileType_VIDEO, "testdata/video-sample.webm"}
|
||||||
a := sync.WaitGroup{}
|
|
||||||
|
|
||||||
for _, f := range filePath {
|
|
||||||
a.Add(1)
|
|
||||||
go func() {
|
|
||||||
createPreview(f.Path, f.Type, client)
|
|
||||||
a.Done()
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
a.Wait()
|
wg := sync.WaitGroup{}
|
||||||
|
|
||||||
|
for _, f := range filePath {
|
||||||
|
wg.Add(2)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
createPreview(f.Path, f.Type, client)
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func(f thingy) {
|
||||||
|
defer wg.Done()
|
||||||
|
createOCR(f.Path, f.Type, client)
|
||||||
|
}(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServiceClient) {
|
func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServiceClient) {
|
||||||
@@ -78,6 +86,40 @@ func createPreview(filePath string, ftype pb.FileType, client pb.ThumbnailServic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func createOCR(filePath string, ftype pb.FileType, client pb.ThumbnailServiceClient) {
|
||||||
|
fileContent, err := os.ReadFile(filePath)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Error reading file: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
req := &pb.OCRFileRequest{
|
||||||
|
FileContent: fileContent,
|
||||||
|
FileType: ftype,
|
||||||
|
CleanUp: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 60000*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
resp, err := client.OcrFile(ctx, req)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Error calling OcrDocument: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("[OCR] %s: %s\n %s", filePath, resp.Message, resp.TextContent)
|
||||||
|
|
||||||
|
if len(resp.OcrContent) > 0 {
|
||||||
|
err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf")
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Error saving OCR text to file: %v", err)
|
||||||
|
} else {
|
||||||
|
fmt.Println("OCR text saved successfully.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Function to save the thumbnail content to a file in the 'thumbnail/' directory
|
// Function to save the thumbnail content to a file in the 'thumbnail/' directory
|
||||||
func saveThumbnailToFile(thumbnailContent []byte, filePath string) error {
|
func saveThumbnailToFile(thumbnailContent []byte, filePath string) error {
|
||||||
// Ensure the "thumbnail" directory exists
|
// Ensure the "thumbnail" directory exists
|
||||||
@@ -97,3 +139,21 @@ func saveThumbnailToFile(thumbnailContent []byte, filePath string) error {
|
|||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// saveToFile writes data to "<folder>/<stem><ext>", where <stem> is the base
// name of originalPath with its own extension removed. The folder is created
// (including parents) when it does not exist yet.
//
// Returned errors wrap the underlying os error with %w, so callers can
// inspect them with errors.Is / errors.As.
func saveToFile(data []byte, originalPath, folder, ext string) error {
	if err := os.MkdirAll(folder, os.ModePerm); err != nil {
		return fmt.Errorf("failed to create directory: %w", err)
	}

	// "testdata/sample.pdf" -> "sample"
	base := filepath.Base(originalPath)
	stem := strings.TrimSuffix(base, filepath.Ext(base))

	target := filepath.Join(folder, stem+ext)
	if err := os.WriteFile(target, data, 0644); err != nil {
		return fmt.Errorf("failed to save file: %w", err)
	}

	return nil
}
|
||||||
|
|||||||
@@ -345,9 +345,10 @@ const file_thumbnail_proto_rawDesc = "" +
|
|||||||
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
||||||
"\x05IMAGE\x10\x01\x12\t\n" +
|
"\x05IMAGE\x10\x01\x12\t\n" +
|
||||||
"\x05VIDEO\x10\x02\x12\a\n" +
|
"\x05VIDEO\x10\x02\x12\a\n" +
|
||||||
"\x03PDF\x10\x032r\n" +
|
"\x03PDF\x10\x032\xc4\x01\n" +
|
||||||
"\x10ThumbnailService\x12^\n" +
|
"\x10ThumbnailService\x12^\n" +
|
||||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponseB\tZ\a./protob\x06proto3"
|
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponse\x12P\n" +
|
||||||
|
"\aOcrFile\x12!.thumbnail_service.OCRFileRequest\x1a\".thumbnail_service.OCRFileResponseB\tZ\a./protob\x06proto3"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
file_thumbnail_proto_rawDescOnce sync.Once
|
file_thumbnail_proto_rawDescOnce sync.Once
|
||||||
@@ -374,9 +375,11 @@ var file_thumbnail_proto_depIdxs = []int32{
|
|||||||
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
||||||
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
||||||
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
||||||
2, // 3: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
3, // 3: thumbnail_service.ThumbnailService.OcrFile:input_type -> thumbnail_service.OCRFileRequest
|
||||||
3, // [3:4] is the sub-list for method output_type
|
2, // 4: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||||
2, // [2:3] is the sub-list for method input_type
|
4, // 5: thumbnail_service.ThumbnailService.OcrFile:output_type -> thumbnail_service.OCRFileResponse
|
||||||
|
4, // [4:6] is the sub-list for method output_type
|
||||||
|
2, // [2:4] is the sub-list for method input_type
|
||||||
2, // [2:2] is the sub-list for extension type_name
|
2, // [2:2] is the sub-list for extension type_name
|
||||||
2, // [2:2] is the sub-list for extension extendee
|
2, // [2:2] is the sub-list for extension extendee
|
||||||
0, // [0:2] is the sub-list for field type_name
|
0, // [0:2] is the sub-list for field type_name
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ const _ = grpc.SupportPackageIsVersion9
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
||||||
|
ThumbnailService_OcrFile_FullMethodName = "/thumbnail_service.ThumbnailService/OcrFile"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
||||||
@@ -29,6 +30,7 @@ const (
|
|||||||
// Service definition
|
// Service definition
|
||||||
type ThumbnailServiceClient interface {
|
type ThumbnailServiceClient interface {
|
||||||
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
||||||
|
OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type thumbnailServiceClient struct {
|
type thumbnailServiceClient struct {
|
||||||
@@ -49,6 +51,16 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *thumbnailServiceClient) OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error) {
|
||||||
|
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
|
||||||
|
out := new(OCRFileResponse)
|
||||||
|
err := c.cc.Invoke(ctx, ThumbnailService_OcrFile_FullMethodName, in, out, cOpts...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
||||||
// All implementations must embed UnimplementedThumbnailServiceServer
|
// All implementations must embed UnimplementedThumbnailServiceServer
|
||||||
// for forward compatibility.
|
// for forward compatibility.
|
||||||
@@ -56,6 +68,7 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
|||||||
// Service definition
|
// Service definition
|
||||||
type ThumbnailServiceServer interface {
|
type ThumbnailServiceServer interface {
|
||||||
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
||||||
|
OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error)
|
||||||
mustEmbedUnimplementedThumbnailServiceServer()
|
mustEmbedUnimplementedThumbnailServiceServer()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,6 +82,9 @@ type UnimplementedThumbnailServiceServer struct{}
|
|||||||
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
||||||
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
||||||
}
|
}
|
||||||
|
func (UnimplementedThumbnailServiceServer) OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error) {
|
||||||
|
return nil, status.Errorf(codes.Unimplemented, "method OcrFile not implemented")
|
||||||
|
}
|
||||||
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
||||||
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
||||||
|
|
||||||
@@ -108,6 +124,24 @@ func _ThumbnailService_GenerateThumbnail_Handler(srv interface{}, ctx context.Co
|
|||||||
return interceptor(ctx, in, info, handler)
|
return interceptor(ctx, in, info, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func _ThumbnailService_OcrFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||||
|
in := new(OCRFileRequest)
|
||||||
|
if err := dec(in); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if interceptor == nil {
|
||||||
|
return srv.(ThumbnailServiceServer).OcrFile(ctx, in)
|
||||||
|
}
|
||||||
|
info := &grpc.UnaryServerInfo{
|
||||||
|
Server: srv,
|
||||||
|
FullMethod: ThumbnailService_OcrFile_FullMethodName,
|
||||||
|
}
|
||||||
|
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
|
||||||
|
return srv.(ThumbnailServiceServer).OcrFile(ctx, req.(*OCRFileRequest))
|
||||||
|
}
|
||||||
|
return interceptor(ctx, in, info, handler)
|
||||||
|
}
|
||||||
|
|
||||||
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
||||||
// It's only intended for direct use with grpc.RegisterService,
|
// It's only intended for direct use with grpc.RegisterService,
|
||||||
// and not to be introspected or modified (even as a copy)
|
// and not to be introspected or modified (even as a copy)
|
||||||
@@ -119,6 +153,10 @@ var ThumbnailService_ServiceDesc = grpc.ServiceDesc{
|
|||||||
MethodName: "GenerateThumbnail",
|
MethodName: "GenerateThumbnail",
|
||||||
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
MethodName: "OcrFile",
|
||||||
|
Handler: _ThumbnailService_OcrFile_Handler,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Streams: []grpc.StreamDesc{},
|
Streams: []grpc.StreamDesc{},
|
||||||
Metadata: "thumbnail.proto",
|
Metadata: "thumbnail.proto",
|
||||||
|
|||||||
@@ -1,12 +1,15 @@
|
|||||||
FROM golang:1.24.1
|
FROM golang:1.24.2
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
imagemagick \
|
imagemagick \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
poppler-utils && \
|
poppler-utils \
|
||||||
rm -rf /var/lib/apt/lists/*
|
ocrmypdf \
|
||||||
|
qpdf \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ go 1.24.1
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/google/uuid v1.6.0
|
github.com/google/uuid v1.6.0
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
|
||||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
|
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
|
||||||
google.golang.org/grpc v1.71.0
|
google.golang.org/grpc v1.71.0
|
||||||
google.golang.org/protobuf v1.36.6
|
google.golang.org/protobuf v1.36.6
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
|||||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
|
||||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
|
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
|
||||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
||||||
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
||||||
|
|||||||
216
server/main.go
216
server/main.go
@@ -2,8 +2,10 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"image"
|
"image"
|
||||||
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
@@ -20,6 +22,7 @@ import (
|
|||||||
|
|
||||||
pb "github.com/JuLi0n21/thumbnail_service/proto"
|
pb "github.com/JuLi0n21/thumbnail_service/proto"
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"github.com/ledongthuc/pdf"
|
||||||
"github.com/nfnt/resize"
|
"github.com/nfnt/resize"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
)
|
)
|
||||||
@@ -170,6 +173,8 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
|
|||||||
if _, err := os.Stat(outputPath); err == nil {
|
if _, err := os.Stat(outputPath); err == nil {
|
||||||
os.Remove(outputPath)
|
os.Remove(outputPath)
|
||||||
}
|
}
|
||||||
|
end := time.Since(time.Now())
|
||||||
|
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
thumbnailContent, err := os.ReadFile(outputPath)
|
thumbnailContent, err := os.ReadFile(outputPath)
|
||||||
@@ -177,58 +182,229 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
|
|||||||
return nil, fmt.Errorf("failed to read generated thumbnail: %v", err)
|
return nil, fmt.Errorf("failed to read generated thumbnail: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
end := time.Since(time.Now())
|
|
||||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
|
||||||
return &pb.ThumbnailResponse{
|
return &pb.ThumbnailResponse{
|
||||||
Message: "Thumbnail generated successfully",
|
Message: "Thumbnail generated successfully",
|
||||||
ThumbnailContent: thumbnailContent,
|
ThumbnailContent: thumbnailContent,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *server) OCRFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFileResponse, error) {
|
func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFileResponse, error) {
|
||||||
|
start := time.Now()
|
||||||
|
fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType)
|
||||||
|
|
||||||
//save to disk
|
defer func() {
|
||||||
|
end := time.Since(time.Now())
|
||||||
|
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
||||||
|
}()
|
||||||
|
if req.FileType != pb.FileType_PDF {
|
||||||
|
err := errors.New("unsupported Filetype " + req.FileType.String())
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
file, err := os.CreateTemp("ocr", "temp-file-*")
|
||||||
|
if err != nil {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
filePath := file.Name()
|
||||||
|
defer func(file *os.File, filePath string) {
|
||||||
|
file.Close()
|
||||||
|
|
||||||
//do preprocessing
|
err = os.Remove(file.Name())
|
||||||
|
if err != nil {
|
||||||
|
fmt.Println(err.Error())
|
||||||
|
}
|
||||||
|
}(file, filePath)
|
||||||
|
|
||||||
//extract information...
|
_, err = file.Write(req.FileContent)
|
||||||
|
if err != nil {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if ok, err := isScannedPDF(filePath); err != nil {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
} else if !ok {
|
||||||
|
if isEncrypted(filePath) {
|
||||||
|
err := decryptPDF(filePath)
|
||||||
|
if err != nil {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = runOCRMyPDF(filePath)
|
||||||
|
if err != nil {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var text string
|
||||||
|
var b []byte
|
||||||
|
text, b, err = extractTextFromPDF(filePath)
|
||||||
|
if err != nil {
|
||||||
|
|
||||||
|
if strings.Contains("malformed pdf", err.Error()) {
|
||||||
|
repairPDF(filePath)
|
||||||
|
text, b, err = extractTextFromPDF(filePath)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
return &pb.OCRFileResponse{
|
||||||
|
Message: "OCR failed, " + err.Error(),
|
||||||
|
TextContent: "",
|
||||||
|
OcrContent: []byte{},
|
||||||
|
}, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//return stuff
|
|
||||||
return &pb.OCRFileResponse{
|
return &pb.OCRFileResponse{
|
||||||
Message: "OCRed successfully",
|
Message: "OCR success",
|
||||||
TextContent: "",
|
TextContent: text,
|
||||||
OcrContent: []byte{},
|
OcrContent: b,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func isScannedPDF(filePath string) bool {
|
func isScannedPDF(path string) (bool, error) {
|
||||||
cmd := exec.Command("pdftotext", filePath, "-")
|
f, r, err := pdf.Open(path)
|
||||||
out, err := cmd.Output()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("pdftotext error for %s: %v", filePath, err)
|
return false, fmt.Errorf("failed to open PDF: %w", err)
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
return len(strings.TrimSpace(string(out))) == 0
|
defer f.Close()
|
||||||
|
|
||||||
|
reader, err := r.GetPlainText()
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("failed to get PDF text: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
content, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("failed to read PDF content: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return len(strings.TrimSpace(string(content))) != 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func runOCRMyPDF(inputPath, outputPath string) error {
|
// runOCRMyPDF runs "ocrmypdf --skip-text" on inputPath, writing to a
// temporary file, and then overwrites inputPath with the processed result.
// The temporary file is removed before returning.
func runOCRMyPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "temp-ocr-*.pdf")
	if err != nil {
		// Check before touching tempfile: it is nil when CreateTemp fails.
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; ocrmypdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	cmd := exec.Command("ocrmypdf", "--skip-text", inputPath, tempPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("ocrmypdf failed: %w\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %w", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}

	return nil
}
|
||||||
|
|
||||||
func decryptPDF(inputPath, outputPath string) error {
|
// isEncrypted reports whether the PDF at pdfPath is encrypted.
//
// It runs "qpdf --is-encrypted", which signals the result through its exit
// status: 0 means encrypted, non-zero means not encrypted. Any failure to
// run qpdf (not installed, unreadable file, ...) is reported as false.
func isEncrypted(pdfPath string) bool {
	// Run returns nil only for exit status 0.
	return exec.Command("qpdf", "--is-encrypted", pdfPath).Run() == nil
}
|
||||||
|
|
||||||
|
// repairPDF rewrites the PDF at inputPath through qpdf and overwrites
// inputPath with the rewritten file. The temporary output file is removed
// before returning.
//
// NOTE(review): the "--repair" argument is kept from the original code;
// confirm against the qpdf CLI documentation that the installed version
// accepts it.
func repairPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; qpdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	cmd := exec.Command("qpdf", "--repair", inputPath, tempPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		// qpdf exits with status 3 when it only emitted warnings; the output
		// file is still written in that case, so treat it as success.
		var exitErr *exec.ExitError
		if !errors.As(err, &exitErr) || exitErr.ExitCode() != 3 {
			return fmt.Errorf("qpdf failed: %w\nOutput: %s", err, output)
		}
	}

	// Copy the repaired document back over the input instead of discarding it.
	repaired, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %w", err)
	}
	if err := os.WriteFile(inputPath, repaired, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}

	return nil
}
|
||||||
|
|
||||||
|
// decryptPDF runs "qpdf --decrypt" on inputPath, writing to a temporary
// file, and then overwrites inputPath with the decrypted result. The
// temporary file is removed before returning.
func decryptPDF(inputPath string) error {
	tempfile, err := os.CreateTemp("", "")
	if err != nil {
		return err
	}
	tempPath := tempfile.Name()
	defer os.Remove(tempPath)

	// Only the path is needed; qpdf writes the file itself.
	if err := tempfile.Close(); err != nil {
		return fmt.Errorf("failed to close temp file: %w", err)
	}

	cmd := exec.Command("qpdf", "--decrypt", inputPath, tempPath)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("qpdf failed: %w\nOutput: %s", err, output)
	}

	processedData, err := os.ReadFile(tempPath)
	if err != nil {
		return fmt.Errorf("failed to read processed file: %w", err)
	}

	if err := os.WriteFile(inputPath, processedData, 0644); err != nil {
		return fmt.Errorf("failed to overwrite input file: %w", err)
	}

	return nil
}
|
||||||
|
|
||||||
|
func extractTextFromPDF(path string) (string, []byte, error) {
|
||||||
|
f, r, err := pdf.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", []byte{}, fmt.Errorf("failed to open PDF: %w", err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
reader, err := r.GetPlainText()
|
||||||
|
if err != nil {
|
||||||
|
return "", []byte{}, fmt.Errorf("failed to get PDF text: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
content, err := io.ReadAll(reader)
|
||||||
|
if err != nil {
|
||||||
|
return "", []byte{}, fmt.Errorf("failed to read PDF content: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
rawData, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", nil, fmt.Errorf("failed to read raw PDF file: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return string(content), rawData, nil
|
||||||
|
}
|
||||||
|
|
||||||
const maxMsgSize = 2147483648 // 2GB
|
const maxMsgSize = 2147483648 // 2GB
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|||||||
@@ -345,9 +345,10 @@ const file_thumbnail_proto_rawDesc = "" +
|
|||||||
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
"\x15FILE_TYPE_UNSPECIFIED\x10\x00\x12\t\n" +
|
||||||
"\x05IMAGE\x10\x01\x12\t\n" +
|
"\x05IMAGE\x10\x01\x12\t\n" +
|
||||||
"\x05VIDEO\x10\x02\x12\a\n" +
|
"\x05VIDEO\x10\x02\x12\a\n" +
|
||||||
"\x03PDF\x10\x032r\n" +
|
"\x03PDF\x10\x032\xc4\x01\n" +
|
||||||
"\x10ThumbnailService\x12^\n" +
|
"\x10ThumbnailService\x12^\n" +
|
||||||
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponseB\tZ\a./protob\x06proto3"
|
"\x11GenerateThumbnail\x12#.thumbnail_service.ThumbnailRequest\x1a$.thumbnail_service.ThumbnailResponse\x12P\n" +
|
||||||
|
"\aOcrFile\x12!.thumbnail_service.OCRFileRequest\x1a\".thumbnail_service.OCRFileResponseB\tZ\a./protob\x06proto3"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
file_thumbnail_proto_rawDescOnce sync.Once
|
file_thumbnail_proto_rawDescOnce sync.Once
|
||||||
@@ -374,9 +375,11 @@ var file_thumbnail_proto_depIdxs = []int32{
|
|||||||
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
0, // 0: thumbnail_service.ThumbnailRequest.file_type:type_name -> thumbnail_service.FileType
|
||||||
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
0, // 1: thumbnail_service.OCRFileRequest.file_type:type_name -> thumbnail_service.FileType
|
||||||
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
1, // 2: thumbnail_service.ThumbnailService.GenerateThumbnail:input_type -> thumbnail_service.ThumbnailRequest
|
||||||
2, // 3: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
3, // 3: thumbnail_service.ThumbnailService.OcrFile:input_type -> thumbnail_service.OCRFileRequest
|
||||||
3, // [3:4] is the sub-list for method output_type
|
2, // 4: thumbnail_service.ThumbnailService.GenerateThumbnail:output_type -> thumbnail_service.ThumbnailResponse
|
||||||
2, // [2:3] is the sub-list for method input_type
|
4, // 5: thumbnail_service.ThumbnailService.OcrFile:output_type -> thumbnail_service.OCRFileResponse
|
||||||
|
4, // [4:6] is the sub-list for method output_type
|
||||||
|
2, // [2:4] is the sub-list for method input_type
|
||||||
2, // [2:2] is the sub-list for extension type_name
|
2, // [2:2] is the sub-list for extension type_name
|
||||||
2, // [2:2] is the sub-list for extension extendee
|
2, // [2:2] is the sub-list for extension extendee
|
||||||
0, // [0:2] is the sub-list for field type_name
|
0, // [0:2] is the sub-list for field type_name
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ const _ = grpc.SupportPackageIsVersion9
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
ThumbnailService_GenerateThumbnail_FullMethodName = "/thumbnail_service.ThumbnailService/GenerateThumbnail"
|
||||||
|
ThumbnailService_OcrFile_FullMethodName = "/thumbnail_service.ThumbnailService/OcrFile"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
// ThumbnailServiceClient is the client API for ThumbnailService service.
|
||||||
@@ -29,6 +30,7 @@ const (
|
|||||||
// Service definition
|
// Service definition
|
||||||
type ThumbnailServiceClient interface {
|
type ThumbnailServiceClient interface {
|
||||||
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
GenerateThumbnail(ctx context.Context, in *ThumbnailRequest, opts ...grpc.CallOption) (*ThumbnailResponse, error)
|
||||||
|
OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type thumbnailServiceClient struct {
|
type thumbnailServiceClient struct {
|
||||||
@@ -49,6 +51,16 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *thumbnailServiceClient) OcrFile(ctx context.Context, in *OCRFileRequest, opts ...grpc.CallOption) (*OCRFileResponse, error) {
|
||||||
|
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
|
||||||
|
out := new(OCRFileResponse)
|
||||||
|
err := c.cc.Invoke(ctx, ThumbnailService_OcrFile_FullMethodName, in, out, cOpts...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
// ThumbnailServiceServer is the server API for ThumbnailService service.
|
||||||
// All implementations must embed UnimplementedThumbnailServiceServer
|
// All implementations must embed UnimplementedThumbnailServiceServer
|
||||||
// for forward compatibility.
|
// for forward compatibility.
|
||||||
@@ -56,6 +68,7 @@ func (c *thumbnailServiceClient) GenerateThumbnail(ctx context.Context, in *Thum
|
|||||||
// Service definition
|
// Service definition
|
||||||
type ThumbnailServiceServer interface {
|
type ThumbnailServiceServer interface {
|
||||||
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error)
|
||||||
|
OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error)
|
||||||
mustEmbedUnimplementedThumbnailServiceServer()
|
mustEmbedUnimplementedThumbnailServiceServer()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,6 +82,9 @@ type UnimplementedThumbnailServiceServer struct{}
|
|||||||
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
func (UnimplementedThumbnailServiceServer) GenerateThumbnail(context.Context, *ThumbnailRequest) (*ThumbnailResponse, error) {
|
||||||
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
return nil, status.Errorf(codes.Unimplemented, "method GenerateThumbnail not implemented")
|
||||||
}
|
}
|
||||||
|
func (UnimplementedThumbnailServiceServer) OcrFile(context.Context, *OCRFileRequest) (*OCRFileResponse, error) {
|
||||||
|
return nil, status.Errorf(codes.Unimplemented, "method OcrFile not implemented")
|
||||||
|
}
|
||||||
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
func (UnimplementedThumbnailServiceServer) mustEmbedUnimplementedThumbnailServiceServer() {}
|
||||||
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
func (UnimplementedThumbnailServiceServer) testEmbeddedByValue() {}
|
||||||
|
|
||||||
@@ -108,6 +124,24 @@ func _ThumbnailService_GenerateThumbnail_Handler(srv interface{}, ctx context.Co
|
|||||||
return interceptor(ctx, in, info, handler)
|
return interceptor(ctx, in, info, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func _ThumbnailService_OcrFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
|
||||||
|
in := new(OCRFileRequest)
|
||||||
|
if err := dec(in); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if interceptor == nil {
|
||||||
|
return srv.(ThumbnailServiceServer).OcrFile(ctx, in)
|
||||||
|
}
|
||||||
|
info := &grpc.UnaryServerInfo{
|
||||||
|
Server: srv,
|
||||||
|
FullMethod: ThumbnailService_OcrFile_FullMethodName,
|
||||||
|
}
|
||||||
|
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
|
||||||
|
return srv.(ThumbnailServiceServer).OcrFile(ctx, req.(*OCRFileRequest))
|
||||||
|
}
|
||||||
|
return interceptor(ctx, in, info, handler)
|
||||||
|
}
|
||||||
|
|
||||||
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
// ThumbnailService_ServiceDesc is the grpc.ServiceDesc for ThumbnailService service.
|
||||||
// It's only intended for direct use with grpc.RegisterService,
|
// It's only intended for direct use with grpc.RegisterService,
|
||||||
// and not to be introspected or modified (even as a copy)
|
// and not to be introspected or modified (even as a copy)
|
||||||
@@ -119,6 +153,10 @@ var ThumbnailService_ServiceDesc = grpc.ServiceDesc{
|
|||||||
MethodName: "GenerateThumbnail",
|
MethodName: "GenerateThumbnail",
|
||||||
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
Handler: _ThumbnailService_GenerateThumbnail_Handler,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
MethodName: "OcrFile",
|
||||||
|
Handler: _ThumbnailService_OcrFile_Handler,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Streams: []grpc.StreamDesc{},
|
Streams: []grpc.StreamDesc{},
|
||||||
Metadata: "thumbnail.proto",
|
Metadata: "thumbnail.proto",
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ enum FileType {
|
|||||||
// Service definition
|
// Service definition
|
||||||
service ThumbnailService {
|
service ThumbnailService {
|
||||||
rpc GenerateThumbnail(ThumbnailRequest) returns (ThumbnailResponse);
|
rpc GenerateThumbnail(ThumbnailRequest) returns (ThumbnailResponse);
|
||||||
|
rpc OcrFile(OCRFileRequest) returns (OCRFileResponse);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Request message for generating thumbnails
|
// Request message for generating thumbnails
|
||||||
|
|||||||
Reference in New Issue
Block a user