小言_互联网的博客

【pdf翻译中文】免费自用日语pdf转中文、韩语pdf转中文(也可以转英文)或者其他小语种法语、德语pdf转中文

439人阅读  评论(0)

不是很精确、有点慢,但是也够用,胜在免费free

效果图:

一些对比:

模型来自于:

https://huggingface.co/models

文件目录


调用模型的代码:

运行此段代码,执行翻译

一些简单的设置在这里控制
pdf2chines.py

import os

import cv2
import easyocr
from PIL import Image
from PIL import Image, ImageDraw, ImageFont

import rect_dealer
from img_text import ImgText

# Directory handed to easyocr.Reader as model_storage_directory.
cut_model_path = r"F:\ocr\cut_model"
detect_model_jap_path = r"F:\ocr\meta_model\manga-ocr-base"  # manga text-detection model; performs poorly, worse than easyocr
trans_model_path = r"F:\ocr\meta_model\m2m100_1.2B"  # Meta's model, loaded by the translation pipeline
# Folder that receives one rendered PNG per PDF page.
pdf2png_save_path = r"F:\ocr\pdf2png"
# Folder containing the input PDFs (the script also writes its output PDF here).
pdf_path = r"F:\ocr\pdfs"
# Paragraphs whose accumulated OCR confidence is below this are not translated.
pass_point = 0.05
blank_png_path = r"F:\ocr\blank.png"
# Upper and lower bounds for the font size chosen when drawing translated text.
DEFUALT_FONT_SIZE = 60
MIN_FONT_SIZE = 20

height_sub = 0.1  # after a position is detected, shrink its height slightly to find each text block
# Tolerance fractions (of a text box's height / width) used by detect_include.
include_height_sub = 0.3
include_width_sub = 0.3
# Text file listing the page images that have already been processed.
finished_list = "finished_list.txt"


def generate_mask(png, graph_infos):
    """
    Build a black/white mask image for one page.

    The page is blacked out completely, then one white box is pasted per
    detected text region. The result is saved as ``<png>_filled.png``.

    :param png: path of the page image
    :param graph_infos: detection results; ``item[0]`` of every entry holds
        four corner points ordered top-left, top-right, bottom-right,
        bottom-left (presumably EasyOCR ``readtext`` output -- confirm)
    :return: path of the saved mask image
    """
    mask_path = "{}_filled.png".format(png)
    canvas = Image.open(png)
    canvas.paste(Image.new('RGBA', canvas.size, (0, 0, 0)), (0, 0))  # black out the whole page
    for entry in graph_infos:
        corners = entry[0]
        top_left, top_right, bottom_right, _bottom_left = corners[0], corners[1], corners[2], corners[3]
        box_x, box_y = top_left[0], top_left[1]
        box_w = top_right[0] - top_left[0]
        box_h = bottom_right[1] - top_right[1]
        shrink = box_h * 0.1  # the box is drawn 10% shorter than detected
        white_box = Image.new('RGBA', (int(box_w), int(box_h - shrink)), (255, 255, 255))
        # NOTE(review): the shrink amount is added to the x offset, not y --
        # confirm this horizontal shift is intended.
        canvas.paste(white_box, (int(box_x + shrink), int(box_y)))
    canvas.save(mask_path)
    return mask_path


def merge_neighbers(png, graph_infos):
    """
    Merge neighbouring detected lines into paragraph rectangles.

    A mask is generated from the detections, the mask is segmented into
    rectangles, and every detection is then offered to those rectangles so
    its text and confidence get attached to one of them.

    :param png: path of the page image
    :param graph_infos: detection results; first item is the corner list, the
        last two items are the text and its confidence
    :return: the rectangles produced by ``rect_dealer.getHoleRects``
    """
    mask_path = generate_mask(png, graph_infos)  # write the mask image
    paragraph_rects = rect_dealer.getHoleRects(mask_path)  # segment the mask
    for detection in graph_infos:
        corners, text, confidence = detection[0], detection[-2], detection[-1]
        # attach the detection to the rectangle that includes it
        detect_include(paragraph_rects, corners, text, confidence)
    return paragraph_rects


def detect_include(rects, pos_info, words, acc):
    """
    Attach one detected text line to the first rectangle that matches it.

    A rectangle matches when it is at least as large as the text box (minus a
    tolerance) and the text box's top-left corner is not left of / above the
    rectangle's top-left corner (again minus a tolerance). On a match the
    rectangle's ``words``, ``acc`` and ``line_num`` are updated in place and
    the search stops. If nothing matches, nothing is changed.

    :param rects: candidate rectangles exposing x, y, w, h, words, acc, line_num
    :param pos_info: four corner points ordered top-left, top-right,
        bottom-right, bottom-left
    :param words: recognised text of the line
    :param acc: recognition confidence of the line (anything ``float()`` accepts)
    :return: None
    """
    top_left = pos_info[0]  # e.g. [939, 791]
    top_right = pos_info[1]  # e.g. [1007, 791]
    bottom_right = pos_info[2]  # e.g. [1007, 805]
    width = top_right[0] - top_left[0]
    height = bottom_right[1] - top_right[1]
    # Tolerances are a fraction of the box size, capped at a fixed pixel count.
    height_sub_num = min(height * include_height_sub, 20)
    width_sub_num = min(include_width_sub * width, 10)
    for rect in rects:
        if not (width - width_sub_num < rect.w and height - height_sub_num < rect.h):
            continue
        print("minus:{},{}".format(rect, pos_info))
        # NOTE(review): only the left/top edges are compared; the right/bottom
        # edges of the text box are never checked against the rectangle.
        if top_left[0] > rect.x - width_sub_num and top_left[1] > rect.y - height_sub_num:
            rect.words += words
            # Running mean over all lines merged so far. The previous
            # "add then halve" update halved the very first confidence and
            # weighted later lines more heavily than earlier ones.
            rect.acc = (rect.acc * rect.line_num + float(acc)) / (rect.line_num + 1)
            rect.line_num += 1  # one more line merged into this rectangle
            return


def change_graph2words(graph_path, languages):
    """
    Run OCR on an image and return the raw detections.

    :param graph_path: path of the image to read
    :param languages: language codes passed to ``easyocr.Reader``
    :return: whatever ``Reader.readtext`` returns for the image
    """
    reader_options = {
        "model_storage_directory": cut_model_path,
        "download_enabled": False,  # models must already be on disk
        "gpu": True,
    }
    return easyocr.Reader(languages, **reader_options).readtext(graph_path)


# Translation pipelines keyed by model path, so the model is loaded only once
# per process instead of once per translated paragraph.
_TRANSLATOR_CACHE = {}


def words2chinese(words, from_lang, tgt_lang):
    """
    Translate text with the model stored at ``trans_model_path``.

    :param words: text to translate (a string, or an iterable of strings that
        is concatenated without separators)
    :param from_lang: source language code handed to the pipeline as ``src_lang``
    :param tgt_lang: target language code handed to the pipeline as ``tgt_lang``
    :return: the pipeline output (the caller reads ``"translation_text"`` from
        each item)
    """
    translator = _TRANSLATOR_CACHE.get(trans_model_path)
    if translator is None:
        # Imported lazily, as before, so importing this module stays cheap.
        from transformers import pipeline
        translator = pipeline("translation", model=trans_model_path)
        _TRANSLATOR_CACHE[trans_model_path] = translator
    to_trans = "".join(words)
    output = translator(to_trans, src_lang=from_lang, tgt_lang=tgt_lang)
    print("翻译原文:{}\n翻译结果:{}".format(to_trans, output))
    return output


def pdf2png(pdf_name):
    """
    Render every page of a PDF to ``<pdf2png_save_path>/<page index>.png``.

    :param pdf_name: path of the PDF file
    :return: list of the written PNG paths, in page order
    """
    import fitz

    # Make sure the output folder exists before the first page is saved.
    os.makedirs(pdf2png_save_path, exist_ok=True)
    # Zoom factor 1.0 on both axes keeps the page's native resolution and no
    # rotation is applied (a factor of 2 would quadruple the pixel count).
    transform = fitz.Matrix(1.0, 1.0).prerotate(0)
    png_paths = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(str(pdf_name)) as doc:
        for page_index in range(doc.page_count):
            pixmap = doc[page_index].get_pixmap(matrix=transform, alpha=False)
            graph_path = os.path.join(pdf2png_save_path, '%s.png' % page_index)
            pixmap.save(graph_path, output="png")
            png_paths.append(graph_path)
    return png_paths


def line_sep(sentense, line_num):
    """
    Break a sentence into ``line_num`` equally sized lines.

    Each of the ``line_num`` chunks is stripped and followed by a newline;
    characters left over after the last chunk are appended unchanged.

    :param sentense: text to split
    :param line_num: number of lines to produce; values <= 0 return the text
        unchanged
    :return: the text with newlines inserted
    """
    if line_num <= 0:
        return sentense
    # At least one character per chunk so short texts still advance.
    chunk = max(len(sentense) // line_num, 1)
    pieces = []
    start = 0
    for _ in range(line_num):
        # Advance the window each round; the old code reused a fixed end
        # index, so only the first chunk was ever cut.
        pieces.append(sentense[start:start + chunk].strip() + "\n")
        start += chunk
    pieces.append(sentense[start:])
    return "".join(pieces)


def draw_text(png, infos):
    """
    Overwrite regions of a page image with their (translated) text.

    Each region is first covered with a white box, then its ``words`` are
    drawn into it. The image is saved back to the same path.

    :param png: path of the page image (read and overwritten)
    :param infos: regions exposing x, y, w, h, words and line_num
    :return: None
    """
    page = Image.open(png)
    for region in infos:
        # White out the original text.
        cover = Image.new('RGBA', (region.w, region.h), (255, 255, 255))
        page.paste(cover, (region.x, region.y))
        # Font size derived from the region's smaller side and its line
        # count, then clamped to [MIN_FONT_SIZE, DEFUALT_FONT_SIZE].
        fitted = int(min(region.w, region.h) / (1.5 * region.line_num))
        font_size = max(min(fitted, DEFUALT_FONT_SIZE), MIN_FONT_SIZE)
        ImgText(region.words, font_size, region.w).draw_text(page, region.x, region.y)
    page.save("{}".format(png))


def clear_png_files():
    """Placeholder for emptying the PNG folder; currently does nothing."""
    return None


def translate_a_pdf(pdf_path, detectlang: list, translate_from_lang: str, translate_to_lang: str):
    """
    Render a PDF to page images and replace the text on each page with its
    translation.

    Pages already listed in the ``finished_list`` file are skipped; every
    page that completes is appended to that file.

    :param pdf_path: path of the PDF to translate
    :param detectlang: language codes for OCR detection
    :param translate_from_lang: source language code for the translator
    :param translate_to_lang: target language code for the translator
    :return: None
    """
    # A missing progress file simply means nothing has been processed yet.
    try:
        with open(finished_list, "r") as f:
            finished = {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        finished = set()
    clear_png_files()  # clear the png folder first, then render the current pdf
    # NOTE(review): pdf2png re-renders every page, including pages that are
    # skipped below as finished -- confirm that overwrite is acceptable.
    png_paths = pdf2png(pdf_path)

    for png in png_paths:
        if png in finished:
            continue
        img_changes = []
        words_result = change_graph2words(png, detectlang)
        print("查找到的文本:{}".format(words_result))
        rects = merge_neighbers(png, words_result)  # merge lines into paragraphs
        for rect in rects:
            if float(rect.acc) < pass_point:
                print("认为这个词正确度{}极低,不进行翻译:{}".format(rect.acc, rect.words))
                continue
            transed_words = words2chinese(rect.words, translate_from_lang, translate_to_lang)
            rect.words = "".join(trans["translation_text"] for trans in transed_words)
            print("存储位置:{}".format(str(rect)))
            img_changes.append(rect)  # remember the region to redraw
        # Draw once per page; redrawing after every rectangle repeated the
        # same work and produced the same final image.
        draw_text(png, img_changes)
        print("输出图片:{}".format(png))
        with open(finished_list, "a+") as f:
            f.write(png + "\n")


from PIL import Image
import os


def combine_imgs_pdf(folder_path, pdf_file_path):
    """
    Combine the page images listed in the ``finished_list`` file into one PDF.

    The list file is emptied after the PDF has been written.

    Args:
        folder_path (str): source folder (currently unused; the image paths
            come from the ``finished_list`` file)
        pdf_file_path (str): output path of the PDF
    """
    def _page_order(path):
        # Numeric file names sort by value so "10.png" follows "9.png";
        # anything else sorts after them, alphabetically.
        stem = os.path.splitext(os.path.basename(path))[0]
        if stem.isdecimal():
            return (0, int(stem), path)
        return (1, 0, path)

    with open(finished_list, "r") as f:
        png_list = [line.strip() for line in f if line.strip()]
    if not png_list:
        # Nothing was processed; leave the list file untouched.
        print("没有已完成的图片,跳过pdf合成")
        return
    png_list.sort(key=_page_order)

    pages = []
    try:
        for file in png_list:
            page = Image.open(file)
            # The PDF writer needs RGB pages; convert every other mode
            # (the previous check was inverted and converted nothing).
            if page.mode != "RGB":
                page = page.convert("RGB")
            pages.append(page)
        pages[0].save(pdf_file_path, "pdf", save_all=True, append_images=pages[1:])
    finally:
        for page in pages:
            page.close()
    with open(finished_list, "w") as f:
        f.write("")


if __name__ == '__main__':
    # OCR looks for Japanese and English text; translation goes ja -> zh.
    from_lang = ["ja", "en"]
    to_lang = ["zh"]
    pdf_name = "ポーズの定理_ダイジェスト.pdf"
    source_pdf = os.path.join(pdf_path, pdf_name)
    translate_a_pdf(source_pdf, from_lang, "ja", "zh")
    # The translated pages are bundled next to the input as "changed_<name>".
    translated_pdf = os.path.join(pdf_path, "changed_" + pdf_name)
    combine_imgs_pdf(pdf2png_save_path, translated_pdf)


 

处理一下一些段落,按照段落去识别
rect_dealer.py

import math

import cv2
from PIL import Image
from PIL import Image, ImageDraw, ImageFont


# A bounding rectangle plus the text gathered inside it
class Rec:
    """
    Rectangle with top-left corner (x, y), width w and height h.

    ``words``, ``acc`` and ``line_num`` start empty / zero and are filled in
    by callers that assign text to the rectangle.
    """

    _TEMPLATE = "x:%s y:%s w:%s h:%s words: %s"

    def __init__(self, x, y, w, h):
        self.x, self.y = x, y
        self.w, self.h = w, h
        self.words = ""
        self.acc = 0
        self.line_num = 0

    def __str__(self):  # __str__ must not take extra parameters
        return self._TEMPLATE % (self.x, self.y, self.w, self.h, self.words)

    # repr shows exactly the same text as str
    __repr__ = __str__


def include_other_recs(rec_in: Rec, recs):
    """
    Tell whether ``rec_in`` encloses any other rectangle in ``recs``.

    Another rectangle counts as enclosed when it lies within ``rec_in``
    horizontally and vertically, with ``rec_in``'s bottom edge at least 5
    pixels below the other rectangle's bottom edge.

    :param rec_in: rectangle to test
    :param recs: rectangles to compare against (``rec_in`` itself is skipped)
    :return: True if at least one other rectangle is enclosed, else False
    """
    return any(
        rec_in != other
        and rec_in.x <= other.x
        and rec_in.x + rec_in.w >= other.x + other.w
        and rec_in.y <= other.y
        and rec_in.y + rec_in.h >= other.y + other.h + 5
        for other in recs
    )


def hole_select(recs):
    """
    Keep only the rectangles that do not enclose another rectangle.

    :param recs: candidate rectangles
    :return: new list with the non-enclosing rectangles, in input order
    """
    return [candidate for candidate in recs if not include_other_recs(candidate, recs)]


class detectWords(object):
    """
    Dilate an image so that nearby bright regions grow together.

    :param src_img: image array exposing ``shape`` (2-D grayscale or 3-D BGR)
    :param width_max_scale: width of the elliptical dilation kernel
    :param height_max_scale: height of the elliptical dilation kernel
    """

    def __init__(self, src_img, width_max_scale=15, height_max_scale=15):
        self.src_img = src_img
        self.width_scale = width_max_scale
        self.height_scale = height_max_scale

    def run(self):
        """
        Convert the image to grayscale if needed and dilate it once.

        :return: the dilated grayscale image
        :raises ValueError: if the image is neither 2-D nor 3-D
        """
        ndim = len(self.src_img.shape)
        if ndim == 2:  # already grayscale
            gray_img = self.src_img
        elif ndim == 3:
            gray_img = cv2.cvtColor(self.src_img, cv2.COLOR_BGR2GRAY)
        else:
            # Previously fell through and failed later on an unbound name.
            raise ValueError("expected a 2-D or 3-D image, got {} dimensions".format(ndim))

        # ksize is (width, height); the two were passed the other way round,
        # which only went unnoticed because both default to 15.
        dilated_kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE, (self.width_scale, self.height_scale))
        # The third positional parameter of cv2.dilate is dst, not the
        # iteration count, so the old literal 10 never set iterations. State
        # the single pass that was effectively being run explicitly.
        return cv2.dilate(gray_img.copy(), dilated_kernel, iterations=1)


# Collect bounding rectangles of the contours in an image; the returned
# rectangles may overlap or contain one another.
def region_hole(image):
    """
    Find contours and return their bounding rectangles.

    A debug rendering of all contours is written to ``region_table.png``.
    Contours with an area below 150 are ignored as noise.

    :param image: single-channel image to search
    :return: list of ``Rec`` bounding rectangles
    """
    contours, _hierarchy = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    preview = cv2.drawContours(image.copy(), contours, -1, (153, 153, 0), 2, maxLevel=2)
    cv2.imwrite("region_table.png", preview)

    candidates = []  # bounding rectangles kept so far
    for contour in contours:
        if cv2.contourArea(contour) >= 150:  # anything smaller is treated as noise
            simplified = cv2.approxPolyDP(contour, 3, True)  # simplify the outline
            candidates.append(Rec(*cv2.boundingRect(simplified)))
    return candidates


def draw_rects(png, recs):
    """
    Save a black image of the same size as ``png`` with every rectangle
    filled white, as ``<png>_filled.png``.

    :param png: path of the image that provides the canvas size
    :param recs: rectangles exposing x, y, w, h
    :return: None
    """
    canvas = Image.open(png)
    canvas.paste(Image.new('RGBA', canvas.size, (0, 0, 0)), (0, 0))  # black out everything
    for rec in recs:
        white_box = Image.new('RGBA', (rec.w, rec.h), (255, 255, 255))
        canvas.paste(white_box, (rec.x, rec.y))
    canvas.save("{}_filled.png".format(png))


def getHoleRects(png_path):
    """
    Segment a mask image into rectangles that enclose no other rectangle.

    :param png_path: path of the mask image
    :return: list of the selected rectangles
    """
    dilated = detectWords(cv2.imread(png_path)).run()  # dilate slightly
    results = hole_select(region_hole(dilated))  # candidates, then filter
    draw_rects('region_table.png', results)  # debug rendering of the result
    return results

if __name__ == '__main__':
    # Manual run against a mask image produced earlier by the pipeline.
    getHoleRects(r'F:\ocr\pdf2png\1.png_filled.png')


 

img_text.py (这段代码抄的网上、实现了图片文本换行的效果)

from PIL import Image, ImageDraw, ImageFont


class ImgText:
    """
    Wrap text to a pixel width and draw it onto an image.

    :param text: text to draw; explicit ``\\n`` separates paragraphs
    :param font_size: font size passed to ``ImageFont.truetype``
    :param width: maximum line width in pixels
    :param font_path: TrueType font file; defaults to ``DEFAULT_FONT_PATH``
    """

    # The previous literal carried an invisible direction mark in front of
    # "C:", which makes the path unusable.
    DEFAULT_FONT_PATH = r'C:\Windows\Fonts\simhei.ttf'

    def __init__(self, text, font_size, width, font_path=None):
        self.font = ImageFont.truetype(font_path or self.DEFAULT_FONT_PATH, font_size)
        # Maximum line width; set it to the width of the target area.
        self.width = width
        # Text to draw.
        self.text = text
        # Wrapped paragraphs, total height, height of one line.
        self.duanluo, self.note_height, self.line_height = self.split_text()

    def _char_size(self, char):
        """Return (width, height) of ``char`` measured from the text origin."""
        if hasattr(self.font, "getbbox"):
            # ImageDraw.textsize / font.getsize were removed in Pillow 10;
            # the right/bottom bbox edges give the same origin-based extent.
            _, _, right, bottom = self.font.getbbox(char)
            return right, bottom
        return self.font.getsize(char)

    def get_duanluo(self, text):
        """
        Wrap one paragraph to ``self.width``.

        :param text: paragraph without explicit newlines
        :return: (wrapped text ending in a newline, tallest character height,
            number of lines)
        """
        pieces = []
        sum_width = 0  # width of the line being built
        line_count = 1
        line_height = 0
        line_has_chars = False
        for char in text:
            width, height = self._char_size(char)
            sum_width += width
            # Wrap only when the line already holds something; otherwise a
            # single over-wide character would leave an empty line behind.
            if sum_width > self.width and line_has_chars:
                line_count += 1
                # The wrapped character opens the new line, so its width
                # must count (it used to be reset to 0).
                sum_width = width
                pieces.append('\n')
            pieces.append(char)
            line_has_chars = True
            line_height = max(height, line_height)
        duanluo = "".join(pieces)
        if not duanluo.endswith('\n'):
            duanluo += '\n'
        return duanluo, line_height, line_count

    def split_text(self):
        """
        Wrap every paragraph of ``self.text``.

        :return: (list of (wrapped paragraph, line count), total height,
            height of one line)
        """
        max_line_height, total_lines = 0, 0
        allText = []
        for text in self.text.split('\n'):
            duanluo, line_height, line_count = self.get_duanluo(text)
            max_line_height = max(line_height, max_line_height)
            total_lines += line_count
            allText.append((duanluo, line_count))
        line_height = max_line_height
        total_height = total_lines * line_height
        return allText, total_height, line_height

    def draw_text(self, note_img, x, y):
        """
        Draw the wrapped text in red onto ``note_img`` starting at (x, y).

        The image is also saved to ``result.png`` in the working directory.

        :param note_img: image to draw on (modified in place)
        :param x: left position of the text
        :param y: top position of the first line
        :return: None
        """
        draw = ImageDraw.Draw(note_img)
        # Start at the top-left corner and move down paragraph by paragraph.
        for duanluo, line_count in self.duanluo:
            draw.text((x, y), duanluo, fill=(255, 0, 0), font=self.font)
            y += self.line_height * line_count
        note_img.save("result.png")

 

步骤:

1.先用easyocr识别文本,easyocr需要下载easyocr的模型,放在cut_model文件夹里

下载地址:https://www.jaided.ai/easyocr/modelhub/ (可能需要科学上网)

2.在这里可以控制easyocr识别的文本语言:

我这里输入ja、en,代表日语(japanese)和英语(english),所以会从图片中检测出日语和英语的文本

3.简单地处理一下块,把一个段落的文本,合并起来

4.输入到翻译模型中,这里可以是任何模型,我试过下面几个模型
绿色框住的是好,其他的由于各种原因,比如太慢、比如性能太差,被我残忍抛弃,
(ps:opus-mt-XX的模型是真的好用,又小又准确,但是它!没有ja-zh,所以……好气!)

例如:m2m100_418M,这个模型在:https://toscode.gitee.com/mirrors_UKPLab/EasyNMT 可以看到,

它的参数量和模型大小都比m2m100_1.2B小,我下载试了试,效果真的不能用


这俩的翻译对比:左边m2m100_418M,右边m2m100_1.2B

性能差了很多,而且会出现奇怪的表现,速度也没有快多少。

模型排行榜
(排行靠前的一大堆,没一个开源的,我只能说,感谢meta,小扎还是良心企业嗷)

网易有道词典小语种翻译实现思路
网易有道的小语种翻译真的很牛,微信在它面前被揍得像个弟弟,可惜模型都不公开,毕竟都是核心资源……

其他语种模型可以去下面的笑脸中心找,很牛的企业,可能需要科学上网,模型太大的话可以用迅雷下载器(或者用别的下载器),

下载器下载能快许多:

也可以用讯飞的api直接就翻译日语了
https://www.xfyun.cn/services/xftrans

给的200万字免费调用,够用一段时间了

m2m100_1.2B模型翻译日文还是有很多不尽如人意的地方,

例如:

1.速度很慢:慢得我有点受不了了
2.正确率还不够好(虽然也不太差了):

—————————————————————————————
后来换了讯飞的接口试了下,也不怎么样(调用接口还很麻烦)
讯飞翻译:

唯一好使的只有有道图片翻译,感觉错误率明显低;而且提供了任意体验的服务,真的很好,如果不是想一键pdf2pdf,那么用有道去翻译一下也可以。


转载:https://blog.csdn.net/qinglingLS/article/details/128063881
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场