diff --git a/client/main.go b/client/main.go index 6fdfd25..e607146 100644 --- a/client/main.go +++ b/client/main.go @@ -32,6 +32,7 @@ func main() { //{pb.FileType_IMAGE, "testdata/image-sample.png"}, {pb.FileType_PDF, "testdata/pdf-sample.pdf"}, {pb.FileType_PDF, "testdata/blitzer.pdf"}, + {pb.FileType_PDF, "testdata/persopin.pdf"}, //{pb.FileType_VIDEO, "testdata/video-sample.webm"} } @@ -105,7 +106,7 @@ func createOCR(filePath string, ftype pb.FileType, client pb.ThumbnailServiceCli return } - fmt.Printf("[OCR] %s: %s\n %s", filePath, resp.Message, resp.TextContent) + fmt.Printf("[OCR] %s: %s\n %s, %dkb\n", filePath, resp.Message, resp.TextContent, len(resp.TextContent)/1024.0) if len(resp.OcrContent) > 0 { err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf") diff --git a/server/main.go b/server/main.go index f0f196e..3fa9aae 100644 --- a/server/main.go +++ b/server/main.go @@ -171,7 +171,7 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest if _, err := os.Stat(outputPath); err == nil { os.Remove(outputPath) } - end := time.Since(time.Now()) + end := time.Since(start) fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth) }() @@ -191,7 +191,7 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType) defer func() { - end := time.Since(time.Now()) + end := time.Since(start) fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType) }() @@ -239,6 +239,10 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi return handleErr("failed to extract text", err) } + if req.CleanUp { + text = cleanOCRText(text) + } + return &pb.OCRFileResponse{ Message: "OCR success", TextContent: text, diff --git a/server/pdfhelpers.go b/server/pdfhelpers.go index c892764..21ee4bf 100644 
// isUselessLine reports whether line carries no textual content worth
// keeping: either it is empty, or it consists solely of one non-alphanumeric
// character repeated at least four times (for example "----", "____" or
// "————"), which is typical of separator rules in OCR output.
//
// The line is inspected rune by rune so that multi-byte separator characters
// are recognised and the repetition threshold counts characters, not bytes.
func isUselessLine(line string) bool {
	if line == "" {
		return true
	}

	var first rune
	count := 0
	for i, r := range line {
		if i == 0 {
			// A line starting with a letter or digit is always kept.
			if unicode.IsLetter(r) || unicode.IsNumber(r) {
				return false
			}
			first = r
		} else if r != first {
			return false
		}
		count++
	}

	// Minimum 4 repeating chars to consider useless.
	return count > 3
}

// cleanOCRText normalises OCR output into a single line of text: every input
// line is trimmed, lines rejected by isUselessLine are dropped, and all
// remaining words are joined by exactly one space. The result has no leading
// or trailing whitespace.
func cleanOCRText(input string) string {
	var builder strings.Builder
	builder.Grow(len(input))

	scanner := bufio.NewScanner(strings.NewReader(input))
	// The default Scanner token limit is 64 KiB; a longer line would stop the
	// scan and silently truncate the text. Allowing a token as large as the
	// whole input (plus one byte of headroom so the buffer is never "full"
	// before EOF is seen) makes that failure impossible.
	scanner.Buffer(nil, len(input)+1)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isUselessLine(line) {
			continue
		}

		// strings.Fields splits on unicode.IsSpace and discards empty fields,
		// which collapses inner whitespace runs. Writing the separator before
		// each word (except the very first) also keeps words on adjacent
		// lines from being glued together.
		for _, word := range strings.Fields(line) {
			if builder.Len() > 0 {
				builder.WriteByte(' ')
			}
			builder.WriteString(word)
		}
	}
	// scanner.Err() needs no check: strings.Reader never returns a read error
	// and the buffer limit above rules out bufio.ErrTooLong.

	return builder.String()
}