mirror of
https://github.com/JuLi0n21/thumbnailservice.git
synced 2026-04-19 16:00:07 +00:00
add small clean up method
This commit is contained in:
@@ -32,6 +32,7 @@ func main() {
|
|||||||
//{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
//{pb.FileType_IMAGE, "testdata/image-sample.png"},
|
||||||
{pb.FileType_PDF, "testdata/pdf-sample.pdf"},
|
{pb.FileType_PDF, "testdata/pdf-sample.pdf"},
|
||||||
{pb.FileType_PDF, "testdata/blitzer.pdf"},
|
{pb.FileType_PDF, "testdata/blitzer.pdf"},
|
||||||
|
{pb.FileType_PDF, "testdata/persopin.pdf"},
|
||||||
//{pb.FileType_VIDEO, "testdata/video-sample.webm"}
|
//{pb.FileType_VIDEO, "testdata/video-sample.webm"}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -105,7 +106,7 @@ func createOCR(filePath string, ftype pb.FileType, client pb.ThumbnailServiceCli
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Printf("[OCR] %s: %s\n %s", filePath, resp.Message, resp.TextContent)
|
fmt.Printf("[OCR] %s: %s\n %s, %dkb\n", filePath, resp.Message, resp.TextContent, len(resp.TextContent)/1024.0)
|
||||||
|
|
||||||
if len(resp.OcrContent) > 0 {
|
if len(resp.OcrContent) > 0 {
|
||||||
err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf")
|
err := saveToFile([]byte(resp.OcrContent), filePath, "ocr", ".pdf")
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ func (s *server) GenerateThumbnail(ctx context.Context, req *pb.ThumbnailRequest
|
|||||||
if _, err := os.Stat(outputPath); err == nil {
|
if _, err := os.Stat(outputPath); err == nil {
|
||||||
os.Remove(outputPath)
|
os.Remove(outputPath)
|
||||||
}
|
}
|
||||||
end := time.Since(time.Now())
|
end := time.Since(start)
|
||||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "Finshed in: ", end, req.FileType, "H: ", req.MaxHeight, "W: ", req.MaxWidth)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -191,7 +191,7 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
|
|||||||
fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType)
|
fmt.Println(start.Format("2006-01-02 15:04:05.000"), "OCR request ", req.FileType)
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
end := time.Since(time.Now())
|
end := time.Since(start)
|
||||||
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
fmt.Println(time.Now().Format("2006-01-02 15:04:05.000"), "OCR Finshed in: ", end, req.FileType)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -239,6 +239,10 @@ func (s *server) OcrFile(ctx context.Context, req *pb.OCRFileRequest) (*pb.OCRFi
|
|||||||
return handleErr("failed to extract text", err)
|
return handleErr("failed to extract text", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if req.CleanUp {
|
||||||
|
text = cleanOCRText(text)
|
||||||
|
}
|
||||||
|
|
||||||
return &pb.OCRFileResponse{
|
return &pb.OCRFileResponse{
|
||||||
Message: "OCR success",
|
Message: "OCR success",
|
||||||
TextContent: text,
|
TextContent: text,
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
func isScannedPDF(path string) (bool, error) {
|
func isScannedPDF(path string) (bool, error) {
|
||||||
@@ -118,3 +120,56 @@ func extractTextFromPDF(path string) (string, []byte, error) {
|
|||||||
|
|
||||||
return string(data), rawData, nil
|
return string(data), rawData, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUselessLine reports whether line carries no textual content worth keeping.
//
// A line is useless when it is empty, or when it is a "rule": at least four
// repetitions of one and the same character that is neither a letter nor a
// number (for example "-----", "=====" or "————").
//
// The line is inspected rune by rune rather than byte by byte, so separator
// characters outside ASCII (em dashes, bullets, box-drawing characters) are
// recognised and the four-character minimum counts characters, not bytes.
// Callers are expected to pass an already-trimmed line; surrounding whitespace
// is not stripped here.
func isUselessLine(line string) bool {
	if line == "" {
		return true
	}

	var first rune
	count := 0
	for _, c := range line {
		if count == 0 {
			first = c
			// Lines that start with a letter or digit are real text, even
			// when that character happens to repeat ("aaaa", "1111").
			if unicode.IsLetter(first) || unicode.IsNumber(first) {
				return false
			}
		} else if c != first {
			return false
		}
		count++
	}

	// Minimum 4 repeating chars to consider useless; shorter runs such as
	// "--" or "..." may be meaningful punctuation.
	return count > 3
}
|
||||||
|
|
||||||
|
func cleanOCRText(input string) string {
|
||||||
|
var builder strings.Builder
|
||||||
|
builder.Grow(len(input))
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(input))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
|
||||||
|
if isUselessLine(line) {
|
||||||
|
builder.WriteRune(' ')
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
prevSpace := false
|
||||||
|
for _, r := range line {
|
||||||
|
if unicode.IsSpace(r) {
|
||||||
|
if !prevSpace {
|
||||||
|
builder.WriteRune(' ')
|
||||||
|
prevSpace = true
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.WriteRune(r)
|
||||||
|
prevSpace = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cleaned := strings.TrimSpace(builder.String())
|
||||||
|
return cleaned
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user