Add small cleanup method

This commit is contained in:
2025-04-23 08:06:03 +02:00
parent ba8a9023ea
commit 494d26c712
3 changed files with 63 additions and 3 deletions

View File

@@ -32,6 +32,7 @@ func main() {
//{pb.FileType_IMAGE, "testdata/image-sample.png"}, //{pb.FileType_IMAGE, "testdata/image-sample.png"},
{pb.FileType_PDF, "testdata/pdf-sample.pdf"}, {pb.FileType_PDF, "testdata/pdf-sample.pdf"},
{pb.FileType_PDF, "testdata/blitzer.pdf"}, {pb.FileType_PDF, "testdata/blitzer.pdf"},
{pb.FileType_PDF, "testdata/persopin.pdf"},
//{pb.FileType_VIDEO, "testdata/video-sample.webm"} //{pb.FileType_VIDEO, "testdata/video-sample.webm"}
} }
@@ -105,7 +106,7 @@ func createOCR(filePath string, ftype pb.FileType, client pb.ThumbnailServiceCli
return return
} }
fmt.Printf("[OCR] %s: %s\n %s", filePath, resp.Message, resp.TextContent) fmt.Printf("[OCR] %s: %s\n %s, %dkb\n", filePath, resp.Message, resp.TextContent, len(resp.TextContent)/1024.0)
if len(resp.OcrContent) > 0 { if len(resp.OcrContent) > 0 {
err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf") err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf")

View File

@@ -171,7 +171,7 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
if _, err := os.Stat(outputPath); err == nil { if _, err := os.Stat(outputPath); err == nil {
os.Remove(outputPath) os.Remove(outputPath)
} }
end := time.Since(time.Now()) end := time.Since(start)
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth) fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
}() }()
@@ -191,7 +191,7 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType) fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType)
defer func() { defer func() {
end := time.Since(time.Now()) end := time.Since(start)
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType) fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
}() }()
@@ -239,6 +239,10 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
return handleErr("failed to extract text", err) return handleErr("failed to extract text", err)
} }
if req.CleanUp {
text = cleanOCRText(text)
}
return &pb.OCRFileResponse{ return &pb.OCRFileResponse{
Message: "OCR success", Message: "OCR success",
TextContent: text, TextContent: text,

View File

@@ -1,10 +1,12 @@
package main package main
import ( import (
"bufio"
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"strings" "strings"
"unicode"
) )
func isScannedPDF(path string) (bool, error) { func isScannedPDF(path string) (bool, error) {
@@ -118,3 +120,56 @@ func extractTextFromPDF(path string) (string, []byte, error) {
return string(data), rawData, nil return string(data), rawData, nil
} }
// isUselessLine reports whether line carries no content worth keeping in
// cleaned OCR output.
//
// A line is useless when it is empty, or when it consists of at least four
// repetitions of one single character that is neither a letter nor a number
// (for example "----", "====" or "————").
//
// The line is inspected rune by rune, so separator lines built from
// multi-byte characters are recognised and the four-character minimum counts
// characters rather than bytes. No trimming is done here; surrounding
// whitespace is treated like any other character.
func isUselessLine(line string) bool {
	if line == "" {
		return true
	}

	var first rune
	count := 0
	for i, c := range line {
		if i == 0 {
			// A line that starts with a letter or a number is always kept.
			if unicode.IsLetter(c) || unicode.IsNumber(c) {
				return false
			}
			first = c
		} else if c != first {
			// Mixed characters: this is not a pure separator line.
			return false
		}
		count++
	}

	// Minimum 4 repeating characters to consider the line useless.
	return count > 3
}
// cleanOCRText normalises raw OCR text into a single line of
// space-separated words.
//
// The input is processed line by line. Each line is trimmed, lines that
// isUselessLine rejects (empty lines, separator rules) are dropped, and the
// remaining words are joined with exactly one space. Line breaks count as
// whitespace, so the last word of one line and the first word of the next
// are never glued together, and dropped lines do not leave extra spaces
// behind. The result has no leading or trailing whitespace.
func cleanOCRText(input string) string {
	var builder strings.Builder
	// The output is never longer than the input: every separator written
	// replaces at least one whitespace character of the original text.
	builder.Grow(len(input))

	scanner := bufio.NewScanner(strings.NewReader(input))
	// Lift the default 64 KiB token limit. Without this, a single over-long
	// line makes Scan stop early and the rest of the text is silently lost.
	// No line can be longer than the input itself.
	scanner.Buffer(nil, len(input)+1)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if isUselessLine(line) {
			continue
		}
		// strings.Fields splits on runs of Unicode whitespace, which
		// collapses repeated spaces and tabs inside the line.
		for _, word := range strings.Fields(line) {
			if builder.Len() > 0 {
				builder.WriteByte(' ')
			}
			builder.WriteString(word)
		}
	}

	return builder.String()
}