A year ago, I open-sourced a reading-order model (see the earlier post 《【文档智能】符合人类阅读顺序的文档模型-LayoutReader及非官方权重开源》).
Where reading order fits in document-intelligence parsing
Reading-order detection aims to recover the word sequence a human reader would naturally follow. Existing OCR engines typically arrange recognized text lines top-to-bottom, left-to-right, which breaks down on certain document types such as multi-column layouts and tables. LayoutReader uses a seq2seq model that captures both text and layout information to predict reading order; it performed strongly in experiments and significantly improved text-line ordering for both open-source and commercial OCR engines.
Github:https://github.com/yujunhuics/LayoutReader
Weights: https://www.modelscope.cn/models/yujunhuinlp/LayoutReader-only-layout-large
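
To make the failure mode concrete, here is a minimal sketch (the boxes and coordinates are invented for illustration): on a two-column page, an OCR-style sort by (top, left) interleaves the two columns, while a human finishes the left column before starting the right one.

# Toy two-column page; each box is (name, (left, top, right, bottom)).
# All coordinates are made up for illustration.
toy_boxes = [
    ("L1", (0, 0, 45, 10)),     # left column, first block
    ("L2", (0, 12, 45, 22)),    # left column, second block
    ("R1", (55, 0, 100, 10)),   # right column, first block
    ("R2", (55, 12, 100, 22)),  # right column, second block
]

# Naive OCR-style ordering: top-to-bottom, then left-to-right.
naive = sorted(toy_boxes, key=lambda b: (b[1][1], b[1][0]))
print([name for name, _ in naive])  # ['L1', 'R1', 'L2', 'R2'] -- columns interleaved

# The human reading order is ['L1', 'L2', 'R1', 'R2']: finish the left column
# first. Recovering that order from layout alone is what LayoutReader does.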
Several readers have messaged me unsure how to use it, so below I take the output of a layout-analysis model and feed it into my open-sourced reading-order model, completing the pipeline. For reference; results first:
Research-report layout analysis followed by reading order; each region is tagged with its reading index, e.g. reader:1.
Paper layout analysis followed by reading order, tagged the same way, e.g. reader:1.
The full code is available at: https://github.com/yujunhuics/LayoutReader/blob/main/vis.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author : Junhui Yu

from collections import defaultdict

import cv2
import torch
from ultralytics import YOLO

from model import LayoutLMv3ForBboxClassification

CLS_TOKEN_ID = 0
UNK_TOKEN_ID = 3
EOS_TOKEN_ID = 2


def BboxesMasks(boxes):
    # Pack normalized boxes into model inputs: CLS + one UNK token per box + EOS.
    bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
    attention_mask = [1] + [1] * len(boxes) + [1]
    return {
        "bbox": torch.tensor([bbox]),
        "attention_mask": torch.tensor([attention_mask]),
        "input_ids": torch.tensor([input_ids]),
    }


def decode(logits, length):
    # Greedy decoding with conflict resolution: every position starts from its
    # argmax slot; when several positions claim the same slot, the highest-scoring
    # one keeps it and the others fall back to their next-best slot.
    logits = logits[1: length + 1, :length]  # drop CLS/EOS rows, keep real slots
    orders = logits.argsort(descending=False).tolist()
    ret = [o.pop() for o in orders]
    while True:
        order_to_idxes = defaultdict(list)
        for idx, order in enumerate(ret):
            order_to_idxes[order].append(idx)
        # Keep only slots claimed by more than one position.
        order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1}
        if not order_to_idxes:
            break
        for order, idxes in order_to_idxes.items():
            idxes_to_logit = {}
            for idx in idxes:
                idxes_to_logit[idx] = logits[idx, order]
            idxes_to_logit = sorted(
                idxes_to_logit.items(), key=lambda x: x[1], reverse=True
            )
            # Everyone but the winner retries its next-best slot.
            for idx, _ in idxes_to_logit[1:]:
                ret[idx] = orders[idx].pop()
    return ret
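

# A quick sanity check of decode() on invented numbers (not real model output):
# rows 0 and 1 both peak at slot 2; row 0 scores higher there (0.9 vs 0.8),
# so row 1 is bumped to its next-best slot, slot 0.
toy = torch.tensor([
    [0.1, 0.2, 0.9],
    [0.3, 0.1, 0.8],
    [0.2, 0.7, 0.1],
])
padded = torch.zeros(5, 5)  # dummy CLS/EOS rows so decode()'s slicing lines up
padded[1:4, :3] = toy
print(decode(padded, 3))  # -> [2, 0, 1]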
def layoutreader(bboxes):
    # Predict a reading order for [left, top, right, bottom] boxes given on
    # the 0-1000 scale the model expects.
    inputs = BboxesMasks(bboxes)
    logits = layoutreader_model(**inputs).logits.cpu().squeeze(0)
    orders = decode(logits, len(bboxes))
    return orders
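

# Standalone usage, for reference (requires layoutreader_model, which is
# loaded further down; the two boxes are invented and already normalized):
#     demo_boxes = [[520, 40, 980, 120],   # right-column block
#                   [20, 40, 480, 120]]    # left-column block
#     orders = layoutreader(demo_boxes)
# The returned permutation is consumed below as `[xyxyes[i] for i in orders]`
# to rearrange detections into reading order.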
# report label set (for report-8n.pt)
# id2name = {
#     0: 'Text',
#     1: 'Title',
#     2: 'Header',
#     3: 'Footer',
#     4: 'Figure',
#     5: 'Table',
#     6: 'Toc',
#     7: 'Figure caption',
#     8: 'Table caption',
#     9: 'Equation',
#     10: 'Footnote'
# }

# paper label set (for paper-8n.pt)
id2name = {
    0: 'Text',
    1: 'Title',
    2: 'Figure',
    3: 'Figure caption',
    4: 'Table',
    5: 'Table caption',
    6: 'Header',
    7: 'Footer',
    8: 'Reference',
    9: 'Equation'
}

# BGR colors for drawing each category.
color_map = {
    'Text': (255, 0, 255),
    'Title': (0, 255, 0),
    'Header': (125, 125, 0),
    'Footer': (255, 255, 0),
    'Figure': (0, 0, 255),
    'Table': (160, 32, 240),
    'Toc': (199, 97, 20),
    'Figure caption': (255, 90, 50),
    'Table caption': (255, 128, 0),
    'Equation': (255, 123, 123),
    'Footnote': (222, 110, 0)
}
image_path = 'page_4.png'

# Reading-order model.
# Download: https://modelscope.cn/models/yujunhuinlp/LayoutReader-only-layout-large
model_path = "./LayoutReader-only-layout-large"
layoutreader_model = LayoutLMv3ForBboxClassification.from_pretrained(model_path)

# Layout-analysis model (360LayoutAnalysis).
# Download: https://huggingface.co/qihoo360/360LayoutAnalysis
layout_model = YOLO('paper-8n.pt')
# layout_model = YOLO('report-8n.pt')

# Step 1: layout analysis.
result = layout_model(image_path, save=False, conf=0.45, save_crop=False, line_width=1)
print(result)

img = cv2.imread(image_path)
page_h, page_w = img.shape[:2]
# LayoutReader expects coordinates normalized to a 0-1000 page.
x_scale = 1000.0 / page_w
y_scale = 1000.0 / page_h

bbox_cls = result[0].boxes.cls.tolist()
xyxyes = result[0].boxes.xyxy.tolist()
confes = result[0].boxes.conf.tolist()
print(xyxyes)

# Step 2: clamp detections to the page and rescale to 0-1000.
boxes = []
for left, top, right, bottom in xyxyes:
    if left < 0:
        left = 0
    if right > page_w:
        right = page_w
    if top < 0:
        top = 0
    if bottom > page_h:
        bottom = page_h
    left = round(left * x_scale)
    top = round(top * y_scale)
    right = round(right * x_scale)
    bottom = round(bottom * y_scale)
    assert (
        1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0), \
        f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'
    boxes.append([left, top, right, bottom])
print(boxes)

# Step 3: predict reading order and rearrange the detections.
orders = layoutreader(boxes)
print(orders)
xyxyes = [xyxyes[i] for i in orders]
bbox_cls = [bbox_cls[i] for i in orders]
confes = [confes[i] for i in orders]
print(xyxyes)

# Step 4: draw each region with its reading index, category, and confidence.
for idx, b_cls, xyxy, conf in zip(range(len(xyxyes)), bbox_cls, xyxyes, confes):
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = xyxy[0], xyxy[1], xyxy[2], xyxy[3]
    cv2.rectangle(img,
                  (int(top_left_x), int(top_left_y)),
                  (int(bottom_right_x), int(bottom_right_y)),
                  color_map[id2name[b_cls]],
                  2)
    cv2.putText(img,
                f"reader:{idx}--" + id2name[b_cls] + ":" + str(round(conf, 2)),
                (int(top_left_x), int(top_left_y) + 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                color_map[id2name[b_cls]],
                3)  # label text: reading index, class, confidence

cv2.imwrite("vis-result.jpg", img)
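
From here the ordered regions can drive text extraction as well. A minimal sketch, assuming a per-region OCR callable such as pytesseract (not part of the script above; any OCR engine would do): crop each region from a clean copy of the page in reading order and concatenate the results.

# Hypothetical continuation: OCR each region in reading order.
# Assumes `pip install pytesseract` plus a local Tesseract install with the
# needed language data; swap in any OCR engine you prefer.
import pytesseract

clean = cv2.imread(image_path)  # re-read: `img` now has boxes drawn on it
page_text = []
for xyxy in xyxyes:  # already rearranged into reading order above
    x0, y0, x1, y1 = map(int, xyxy)
    page_text.append(pytesseract.image_to_string(clean[y0:y1, x0:x1]))
print("\n\n".join(page_text))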
About the author: Yu Junhui (余俊晖). Research interests: natural language processing, large language models, and document intelligence. Winner of nearly twenty first-, second-, and third-place finishes in domestic and international AI competitions and evaluations, including CCF, Kaggle, ICPR, ICDAR, CCL, and CAIL; author of several SCI and top-conference papers and a number of patents.