效果图：

一些对比：

模型来自于：

https://huggingface.co/models

文件目录

调用模型的代码：

运行此段代码，执行翻译

一些简单的设置在这里控制
pdf2chines.py

import os

import cv2
import easyocr
from PIL import Image
from PIL import Image, ImageDraw, ImageFont

import rect_dealer
from img_text import ImgText

# Directory handed to easyocr.Reader as model_storage_directory.
cut_model_path = r"F:\ocr\cut_model"
detect_model_jap_path = r"F:\ocr\meta_model\manga-ocr-base"  # manga text detection model; performed poorly, worse than easyocr
trans_model_path = r"F:\ocr\meta_model\m2m100_1.2B"  # Meta's translation model
# Folder that pdf2png() writes the rendered page images into.
pdf2png_save_path = r"F:\ocr\pdf2png"
# Folder containing the input PDFs (joined with the file name in the script entry point).
pdf_path = r"F:\ocr\pdfs"
# Regions whose accumulated confidence is below this value are not translated.
pass_point = 0.05
blank_png_path = r"F:\ocr\blank.png"
# Upper and lower clamp for the font size chosen in draw_text().
DEFUALT_FONT_SIZE = 60
MIN_FONT_SIZE = 20

height_sub = 0.1  # after a text box is located, shrink its height slightly so individual text blocks can be found
# Relative tolerances used by detect_include() when matching a text box to a region.
include_height_sub = 0.3
include_width_sub = 0.3
# Text file listing the page images that have already been processed, one path per line.
finished_list = "finished_list.txt"


def generate_mask(png, graph_infos):
    """
    Build a black-and-white mask image for one page.

    The page is blacked out completely, then one white rectangle is pasted for
    every detected text box. The result is saved as ``<png>_filled.png``.

    :param png: path of the page image (its size is reused for the mask)
    :param graph_infos: detections whose first item holds the four corner
        points in the order top-left, top-right, bottom-right, bottom-left
    :return: path of the saved mask image
    """
    mask_path = "{}_filled.png".format(png)
    page = Image.open(png)
    page.paste(Image.new('RGBA', page.size, (0, 0, 0)), (0, 0))  # black out everything
    for detection in graph_infos:
        corners = detection[0]
        top_left, top_right, bottom_right = corners[0], corners[1], corners[2]
        box_x, box_y = top_left[0], top_left[1]
        box_width = top_right[0] - box_x
        box_height = bottom_right[1] - top_right[1]
        shrink = box_height * 0.1  # make the white box a little shorter than the detection
        white_box = Image.new('RGBA', (int(box_width), int(box_height - shrink)), (255, 255, 255))
        # NOTE(review): the shrink amount is added to the x coordinate, not y —
        # confirm whether a vertical offset was intended.
        page.paste(white_box, (int(box_x + shrink), int(box_y)))
    page.save(mask_path)
    return mask_path


def merge_neighbers(png, graph_infos):
    """
    Merge neighbouring text lines into shared regions.

    A mask image is generated from the detections, the regions are extracted
    from that mask, and every detection is then offered to the regions so its
    text and confidence can be accumulated there.

    :param png: path of the page image
    :param graph_infos: detections; item 0 is the box, the last two items are
        the recognised text and its confidence
    :return: the regions returned by rect_dealer.getHoleRects
    """
    mask_path = generate_mask(png, graph_infos)  # draw the mask image
    regions = rect_dealer.getHoleRects(mask_path)  # extract the merged regions
    for detection in graph_infos:
        box, text, confidence = detection[0], detection[-2], detection[-1]
        detect_include(regions, box, text, confidence)  # attach the line to its region
    return regions


def detect_include(rects, pos_info, words, acc):
    """
    Attach one detected text line to the first region that contains it.

    A region matches when the text box is not larger than the region and lies
    inside it on all four sides, each comparison being relaxed by a tolerance
    derived from ``include_width_sub`` / ``include_height_sub``. On a match the
    text is appended to ``rect.words``, ``rect.acc`` becomes the mean
    confidence of all lines attached so far, and ``rect.line_num`` is
    incremented. Nothing happens when no region matches.

    :param rects: candidate regions exposing x, y, w, h, words, acc, line_num
    :param pos_info: four corner points of the text box in the order
        top-left, top-right, bottom-right, bottom-left
    :param words: recognised text of the box
    :param acc: recognition confidence of the box
    :return: None
    """
    left_up_point = pos_info[0]  # e.g. [939, 791]
    right_up_point = pos_info[1]  # e.g. [1007, 791]
    right_down_point = pos_info[2]  # e.g. [1007, 805]
    box_x, box_y = left_up_point[0], left_up_point[1]
    width = right_up_point[0] - box_x
    height = right_down_point[1] - right_up_point[1]
    # Tolerances grow with the box size but are capped in absolute pixels.
    height_sub_num = min(height * include_height_sub, 20)
    width_sub_num = min(include_width_sub * width, 10)
    for rect in rects:
        if not (width - width_sub_num < rect.w and height - height_sub_num < rect.h):
            continue
        print("minus:{},{}".format(rect, pos_info))
        starts_inside = box_x > rect.x - width_sub_num and box_y > rect.y - height_sub_num
        # Also bound the right and bottom edges; checking only the top-left
        # corner let a box far below or right of a large region be absorbed.
        ends_inside = (box_x + width < rect.x + rect.w + width_sub_num
                       and box_y + height < rect.y + rect.h + height_sub_num)
        if starts_inside and ends_inside:
            rect.words += words
            # True running mean: the first line keeps its full confidence
            # rather than being averaged against the initial 0.
            rect.acc = (rect.acc * rect.line_num + float(acc)) / (rect.line_num + 1)
            rect.line_num += 1  # one more line in this region
            return


# EasyOCR readers keyed by language tuple so the models are loaded only once.
_OCR_READERS = {}


def change_graph2words(graph_path, languages):
    """
    Run OCR on an image and return the recognised text lines.

    The EasyOCR reader is created on first use for a given language list and
    reused afterwards, instead of reloading the models on every call.

    :param graph_path: path of the image to read
    :param languages: language codes passed to easyocr.Reader
    :return: the result of ``reader.readtext(graph_path)``
    """
    cache_key = tuple(languages)
    reader = _OCR_READERS.get(cache_key)
    if reader is None:
        reader = easyocr.Reader(languages, model_storage_directory=cut_model_path, download_enabled=False, gpu=True)
        _OCR_READERS[cache_key] = reader
    return reader.readtext(graph_path)


# Translation pipelines keyed by model path so the model is loaded only once.
_TRANSLATORS = {}


def words2chinese(words, from_lang, tgt_lang):
    """
    Translate text with the model stored at ``trans_model_path``.

    The transformers pipeline is built on first use and cached; building it
    for every text region would reload the whole model each time.

    :param words: text to translate (a string or an iterable of strings,
        joined without separator)
    :param from_lang: source language code passed as ``src_lang``
    :param tgt_lang: target language code passed as ``tgt_lang``
    :return: the raw pipeline output
    """
    translator = _TRANSLATORS.get(trans_model_path)
    if translator is None:
        from transformers import pipeline
        translator = pipeline("translation", model=trans_model_path)
        _TRANSLATORS[trans_model_path] = translator
    to_trans = "".join(words)
    output = translator(to_trans, src_lang=from_lang, tgt_lang=tgt_lang)
    print("翻译原文：{}\n翻译结果：{}".format(to_trans, output))
    return output


def pdf2png(pdf_name, zoom=1.0):
    """
    Render every page of a PDF to ``<pdf2png_save_path>/<page index>.png``.

    :param pdf_name: path of the PDF file
    :param zoom: scale factor applied to both axes; 1.0 keeps the native
        size, 2.0 would give four times as many pixels
    :return: list of the written PNG paths in page order
    """
    import fitz

    os.makedirs(pdf2png_save_path, exist_ok=True)  # make sure the output folder exists
    png_paths = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(str(pdf_name)) as doc:
        trans = fitz.Matrix(zoom, zoom)
        for pg in range(doc.page_count):
            pm = doc[pg].get_pixmap(matrix=trans, alpha=False)
            graph_path = os.path.join(pdf2png_save_path, '%s.png' % pg)
            pm.save(graph_path, output="png")
            png_paths.append(graph_path)
    return png_paths


def line_sep(sentense, line_num):
    """
    Break a sentence into at most ``line_num`` lines of equal length.

    The sentence is cut into consecutive chunks of ``ceil(len / line_num)``
    characters; each chunk is stripped of surrounding whitespace and the
    chunks are joined with newlines. The last line may be shorter.

    :param sentense: text to split
    :param line_num: desired number of lines; values below 1 are treated as 1
    :return: the text with newline separators, or "" for empty input
    """
    if not sentense:
        return ""
    line_count = max(int(line_num), 1)
    chunk = -(-len(sentense) // line_count)  # ceiling division
    pieces = [sentense[start:start + chunk].strip() for start in range(0, len(sentense), chunk)]
    return "\n".join(pieces)


def draw_text(png, infos):
    """
    Overwrite regions of a page image with their (translated) text.

    Each region is first covered with a white rectangle, then its ``words``
    are drawn at the region's top-left corner. The image is saved back to the
    same path.

    :param png: path of the page image, modified in place
    :param infos: regions exposing x, y, w, h, words and line_num
    :return: None
    """
    page = Image.open(png)
    for region in infos:
        eraser = Image.new('RGBA', (region.w, region.h), (255, 255, 255))
        page.paste(eraser, (region.x, region.y))
        # Derive the font size from the smaller side and the line count,
        # then clamp it between MIN_FONT_SIZE and DEFUALT_FONT_SIZE.
        fitted_size = int(min(region.w, region.h) / (1.5 * region.line_num))
        font_size = max(min(fitted_size, DEFUALT_FONT_SIZE), MIN_FONT_SIZE)
        renderer = ImgText(region.words, font_size, region.w)
        renderer.draw_text(page, region.x, region.y)
    page.save("{}".format(png))


def clear_png_files():
    """
    Placeholder that currently does nothing.

    NOTE(review): the call site describes this step as emptying the PNG folder
    before a new PDF is converted, but no such logic is implemented here.
    """
    pass


def translate_a_pdf(pdf_path, detectlang: list, translate_from_lang: str, translate_to_lang: str):
    """
    Translate every page of a PDF in place on its rendered PNG images.

    Pages are rendered to PNG, OCR'd, grouped into regions, translated region
    by region and redrawn. Each finished page path is appended to the
    ``finished_list`` file; pages already listed there are skipped.

    :param pdf_path: path of the PDF to translate
    :param detectlang: language codes for OCR
    :param translate_from_lang: source language code for translation
    :param translate_to_lang: target language code for translation
    :return: None
    """
    # A missing progress file just means nothing has been processed yet.
    try:
        with open(finished_list, "r") as f:
            finished = {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        finished = set()
    clear_png_files()  # intended to empty the PNG folder before converting this PDF
    # NOTE(review): pdf2png() re-renders every page, so pages skipped below as
    # "finished" have been overwritten with their untranslated render.
    png_paths = pdf2png(pdf_path)

    for png in png_paths:
        if png in finished:
            continue
        img_changes = []
        words_result = change_graph2words(png, detectlang)
        print("查找到的文本：{}".format(words_result))
        rects = merge_neighbers(png, words_result)  # merge lines into regions
        for rect in rects:
            if float(rect.acc) < pass_point:
                print("认为这个词正确度{}极低,不进行翻译：{}".format(rect.acc, rect.words))
                continue
            transed_words = words2chinese(rect.words, translate_from_lang, translate_to_lang)
            rect.words = "".join(trans["translation_text"] for trans in transed_words)
            print("存储位置：{}".format(str(rect)))
            img_changes.append(rect)  # remember the region to redraw
        # Draw once per page; redrawing after every region repeated the work
        # for all earlier regions without changing the final image.
        draw_text(png, img_changes)
        print("输出图片：{}".format(png))
        with open(finished_list, "a+") as f:
            f.write(png + "\n")


from PIL import Image
import os


def _page_sort_key(path):
    """Order page images by numeric file stem so that 10.png follows 9.png."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdecimal():
        return (0, int(stem), path)
    return (1, 0, path)  # non-numeric names go last, in plain string order


def combine_imgs_pdf(folder_path, pdf_file_path):
    """
    Combine the page images listed in the ``finished_list`` file into one PDF.

    The list file is emptied after the PDF has been written.

    Args:
        folder_path (str): source folder; currently unused, the image paths
            are taken from the ``finished_list`` file
        pdf_file_path (str): output path of the PDF
    """
    with open(finished_list, "r") as f:
        png_list = [line.strip() for line in f if line.strip()]
    if not png_list:
        print("no finished pages listed in {}, nothing to combine".format(finished_list))
        return
    # Numeric order: a plain string sort would put "10.png" before "2.png".
    png_list.sort(key=_page_sort_key)
    pages = []
    for file in png_list:
        png_file = Image.open(file)
        if png_file.mode == "RGBA":
            # Drop the alpha channel before writing the page into the PDF.
            png_file = png_file.convert("RGB")
        pages.append(png_file)
    pages[0].save(pdf_file_path, "pdf", save_all=True, append_images=pages[1:])
    with open(finished_list, "w") as f:
        f.write("")


if __name__ == '__main__':
    # Languages the OCR step should look for on every page.
    ocr_languages = ["ja", "en"]
    source_pdf = "ポーズの定理_ダイジェスト.pdf"
    # Translate the pages, then bundle the translated images into a new PDF.
    translate_a_pdf(os.path.join(pdf_path, source_pdf), ocr_languages, "ja", "zh")
    combine_imgs_pdf(pdf2png_save_path, os.path.join(pdf_path, "changed_" + source_pdf))

对检测到的文本块做一些处理，按照段落进行识别
rect_dealer.py

import math

import cv2
from PIL import Image
from PIL import Image, ImageDraw, ImageFont


# Rectangle describing a region boundary
class Rec:
    """
    Axis-aligned rectangle with the text collected for it.

    Attributes set by the constructor: x, y, w, h (position and size),
    words (initially ""), acc (initially 0) and line_num (initially 0).
    """

    def __init__(self, x, y, w, h):
        self.x, self.y = x, y
        self.w, self.h = w, h
        self.words = ""
        self.acc = 0
        self.line_num = 0

    def __repr__(self):
        return "x:{} y:{} w:{} h:{} words: {}".format(self.x, self.y, self.w, self.h, self.words)

    def __str__(self):
        # Same text as the repr.
        return self.__repr__()


def include_other_recs(rec_in: Rec, recs):
    """
    Tell whether ``rec_in`` encloses any other rectangle of ``recs``.

    Another rectangle counts as enclosed when it lies within ``rec_in``
    horizontally and vertically, with the bottom edge of ``rec_in`` at least
    5 units below the other rectangle's bottom edge.

    :param rec_in: rectangle to test
    :param recs: rectangles to compare against (may contain ``rec_in``)
    :return: True if at least one other rectangle is enclosed, else False
    """

    def encloses(other):
        return (rec_in.x <= other.x
                and rec_in.x + rec_in.w >= other.x + other.w
                and rec_in.y <= other.y
                and rec_in.y + rec_in.h >= other.y + other.h + 5)

    return any(rec_in != other and encloses(other) for other in recs)


def hole_select(recs):
    """
    Keep only the rectangles that do not enclose another rectangle.

    :param recs: candidate rectangles
    :return: new list with the rectangles for which include_other_recs is False
    """
    return [candidate for candidate in recs if not include_other_recs(candidate, recs)]


class detectWords(object):
    """
    Dilate an image so that nearby bright areas grow together.

    :param src_img: image array with a ``shape`` attribute (2-D greyscale or
        3-D colour, converted with COLOR_BGR2GRAY)
    :param width_max_scale: width of the elliptical dilation kernel
    :param height_max_scale: height of the elliptical dilation kernel
    """

    def __init__(self, src_img, width_max_scale=15, height_max_scale=15):
        self.src_img = src_img
        self.width_scale = width_max_scale
        self.height_scale = height_max_scale

    def run(self):
        """
        Return the dilated greyscale version of the source image.

        :raises ValueError: if the image is neither 2-D nor 3-D
        """
        dims = len(self.src_img.shape)
        if dims == 2:  # already greyscale
            gray_img = self.src_img
        elif dims == 3:
            gray_img = cv2.cvtColor(self.src_img, cv2.COLOR_BGR2GRAY)
        else:
            raise ValueError("unsupported image shape: {}".format(self.src_img.shape))

        # OpenCV kernel sizes are given as (width, height).
        dilated_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (self.width_scale, self.height_scale))
        # The third positional parameter of cv2.dilate is ``dst``, not the
        # iteration count, so the count is spelled out as a keyword instead.
        dilated = cv2.dilate(gray_img.copy(), dilated_kernel, iterations=1)
        return dilated


# Collect candidate rectangles from the contours of the image (they may overlap or enclose each other)
def region_hole(image):
    """
    Find contours in ``image`` and return a bounding Rec for each large one.

    A preview with all contours drawn is written to ``region_table.png``.
    Contours with an area below 150 are ignored as noise.

    :param image: single-channel image passed to cv2.findContours
    :return: list of Rec bounding rectangles
    """
    contours, _hierarchy = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    preview = cv2.drawContours(image.copy(), contours, -1, (153, 153, 0), 2, maxLevel=2)
    cv2.imwrite("region_table.png", preview)

    candidates = []
    for outline in contours:
        if cv2.contourArea(outline) >= 150:  # skip small specks
            simplified = cv2.approxPolyDP(outline, 3, True)  # simplify the outline first
            candidates.append(Rec(*cv2.boundingRect(simplified)))
    return candidates


def draw_rects(png, recs):
    """
    Save a black image of the same size as ``png`` with ``recs`` drawn white.

    The output is written to ``<png>_filled.png``.

    :param png: path of the image whose size is reused
    :param recs: rectangles exposing x, y, w and h
    :return: None
    """
    canvas = Image.open(png)
    canvas.paste(Image.new('RGBA', canvas.size, (0, 0, 0)), (0, 0))  # black background
    for rec in recs:
        canvas.paste(Image.new('RGBA', (rec.w, rec.h), (255, 255, 255)), (rec.x, rec.y))
    canvas.save("{}_filled.png".format(png))


def getHoleRects(png_path):
    """
    Extract the innermost regions from a mask image.

    The image is dilated, candidate rectangles are taken from its contours,
    rectangles enclosing others are dropped, and the survivors are drawn via
    draw_rects('region_table.png', ...).

    :param png_path: path of the mask image
    :return: list of the selected Rec objects
    """
    source = cv2.imread(png_path)
    dilated = detectWords(source).run()  # slight dilation
    holes = hole_select(region_hole(dilated))  # candidates, then filtering
    draw_rects('region_table.png', holes)
    return holes

if __name__ == '__main__':
    # Manual run against a previously generated mask image.
    getHoleRects(r'F:\ocr\pdf2png\1.png_filled.png')

img_text.py （这段代码参考自网上，实现了图片中文本自动换行的效果）

from PIL import Image, ImageDraw, ImageFont


class ImgText:
    """
    Wrap a text to a pixel width and draw it onto a PIL image.

    :param text: text to draw; explicit newlines start new paragraphs
    :param font_size: font size passed to ImageFont.truetype
    :param width: maximum line width in pixels
    :param font_path: TrueType font file to load
    """

    def __init__(self, text, font_size, width, font_path=r'C:\Windows\Fonts\simhei.ttf'):
        self.font = ImageFont.truetype(font_path, font_size)
        # Target width; wrapped lines are kept within it
        self.width = width
        # Text to render
        self.text = text
        # Wrapped paragraphs, total height, line height
        self.duanluo, self.note_height, self.line_height = self.split_text()

    def _char_size(self, char):
        """Return (width, height) of ``char`` in the loaded font."""
        if hasattr(self.font, "getbbox"):
            # ImageDraw.textsize / ImageFont.getsize are gone from current
            # Pillow; the right/bottom edges of the bounding box stand in for
            # the extent they used to report.
            _, _, right, bottom = self.font.getbbox(char)
            return right, bottom
        return self.font.getsize(char)  # older Pillow without getbbox

    def get_duanluo(self, text):
        """
        Wrap one paragraph to ``self.width``.

        :param text: paragraph without newlines
        :return: (wrapped text ending in a newline, tallest character height,
            number of lines)
        """
        pieces = []
        current_width = 0  # width of the line being filled
        line_count = 1
        line_height = 0
        for char in text:
            char_width, char_height = self._char_size(char)
            # Break before a character that would overflow, but never on an
            # empty line (that would only add a blank line).
            if current_width and current_width + char_width > self.width:
                pieces.append('\n')
                line_count += 1
                current_width = 0
            pieces.append(char)
            # The wrapped character counts towards the new line's width.
            current_width += char_width
            line_height = max(char_height, line_height)
        duanluo = "".join(pieces)
        if not duanluo.endswith('\n'):
            duanluo += '\n'
        return duanluo, line_height, line_count

    def split_text(self):
        """
        Wrap every paragraph of ``self.text``.

        :return: (list of (wrapped paragraph, line count), total height,
            line height), the line height being the tallest character seen
        """
        max_line_height, total_lines = 0, 0
        allText = []
        for text in self.text.split('\n'):
            duanluo, line_height, line_count = self.get_duanluo(text)
            max_line_height = max(line_height, max_line_height)
            total_lines += line_count
            allText.append((duanluo, line_count))
        line_height = max_line_height
        total_height = total_lines * line_height
        return allText, total_height, line_height

    def draw_text(self, note_img, x, y):
        """
        Draw the wrapped text in red onto ``note_img``.

        :param note_img: PIL image to draw on (modified in place)
        :param x: left coordinate of the text
        :param y: top coordinate of the first paragraph
        :return: None
        """
        draw = ImageDraw.Draw(note_img)
        # Start at the top-left corner and move down paragraph by paragraph
        for duanluo, line_count in self.duanluo:
            draw.text((x, y), duanluo, fill=(255, 0, 0), font=self.font)
            y += self.line_height * line_count
        # A copy of the current image is also written to result.png.
        note_img.save("result.png")

步骤：

1.先用easyocr识别文本，easyocr需要下载easyocr的模型，放在cut_model文件夹里

下载地址：https://www.jaided.ai/easyocr/modelhub/ （可能需要科学上网才能访问）

2.在这里可以控制easyocr识别的文本语言：

我这里输入ja、en，代表日语（japanese）和英语（english），所以会从图片中检测出日语和英语的文本

3.简单地处理一下块，把一个段落的文本，合并起来

4.输入到翻译模型中，这里可以是任何模型，我试过下面几个模型
绿色框住的是效果好的，其余的由于各种原因（比如速度太慢、效果太差）被我残忍抛弃。
（ps：opus-mt-XX的模型是真的好用，又小又准确，但是它！没有ja-zh，所以……好气！）

例如：m2m100_418M，这个模型在：https://toscode.gitee.com/mirrors_UKPLab/EasyNMT 可以看到，