add small clean up method

This commit is contained in:
2025-04-23 08:06:03 +02:00
parent ba8a9023ea
commit 494d26c712
3 changed files with 63 additions and 3 deletions

View File

@@ -1,10 +1,12 @@
package main
import (
"bufio"
"fmt"
"os"
"os/exec"
"strings"
"unicode"
)
func isScannedPDF(path string) (bool, error) {
@@ -118,3 +120,56 @@ func extractTextFromPDF(path string) (string, []byte, error) {
return string(data), rawData, nil
}
// isUselessLine reports whether line carries no textual content worth
// keeping. A line is useless when it is empty, or when it consists of at
// least four repetitions of one and the same character that is neither a
// letter nor a number (separator rows such as "----", "====" or "────").
//
// The line is inspected rune by rune rather than byte by byte, so separator
// rows built from multi-byte characters (box-drawing lines, bullets, ...)
// are recognised and the four-character minimum counts characters, not
// UTF-8 bytes.
func isUselessLine(line string) bool {
	if len(line) == 0 {
		return true
	}

	var first rune
	count := 0
	for _, r := range line {
		if count == 0 {
			// Lines that start with a letter or a number are always kept.
			if unicode.IsLetter(r) || unicode.IsNumber(r) {
				return false
			}
			first = r
		} else if r != first {
			// Mixed characters: not a pure separator row.
			return false
		}
		count++
	}

	// Minimum 4 repeating chars to consider the line useless.
	return count > 3
}
// cleanOCRText normalises raw OCR output into a single line of text.
//
// The input is processed line by line: each line is trimmed, lines that
// isUselessLine rejects are dropped, and every run of whitespace is
// collapsed into one ASCII space. Line breaks and dropped lines count as
// whitespace too, so words on adjacent lines stay separated by exactly one
// space. The result has no leading or trailing whitespace.
func cleanOCRText(input string) string {
	var builder strings.Builder
	builder.Grow(len(input))

	scanner := bufio.NewScanner(strings.NewReader(input))
	// The default Scanner token limit is 64 KiB; a longer line would stop the
	// scan and silently drop the rest of the document. Allow a single line to
	// be as large as the whole input so scanning cannot fail on line length.
	maxLine := len(input) + 1
	if maxLine < bufio.MaxScanTokenSize {
		maxLine = bufio.MaxScanTokenSize
	}
	scanner.Buffer(make([]byte, 0, 4096), maxLine)

	// pendingSpace records that whitespace was seen since the last character
	// written; the space is emitted lazily, right before the next character,
	// so the output never starts or ends with a space.
	pendingSpace := false
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// A line break separates words just like any other whitespace.
		pendingSpace = true
		if isUselessLine(line) {
			continue
		}

		for _, r := range line {
			if unicode.IsSpace(r) {
				pendingSpace = true
				continue
			}
			if pendingSpace && builder.Len() > 0 {
				builder.WriteByte(' ')
			}
			pendingSpace = false
			builder.WriteRune(r)
		}
	}
	// scanner.Err() is not consulted: strings.Reader never returns a read
	// error and the enlarged buffer rules out bufio.ErrTooLong.

	return builder.String()
}