A year ago, I open-sourced a reading-order model (see the earlier post 《【文档智能】符合人类阅读顺序的文档模型-LayoutReader及非官方权重开源》).
Where reading order fits in document-intelligence parsing
Reading-order detection aims to recover the word sequence a human reader would naturally follow. Existing OCR engines typically arrange recognized text lines top-to-bottom, left-to-right, which breaks down on certain document types such as multi-column layouts and tables. LayoutReader uses a seq2seq model that captures both text and layout information to predict reading order; it performed strongly in experiments and significantly improved text-line ordering for both open-source and commercial OCR engines.
Github:https://github.com/yujunhuics/LayoutReader
Weights: https://www.modelscope.cn/models/yujunhuinlp/LayoutReader-only-layout-large
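
To make the failure mode concrete, here is a minimal sketch (the boxes and coordinates are invented for illustration): on a two-column page, an OCR-style sort by (top, left) interleaves the two columns, while a human finishes the left column before starting the right one.

# Toy two-column page; each box is (name, (left, top, right, bottom)).
# All coordinates are made up for illustration.
toy_boxes = [
    ("L1", (0, 0, 45, 10)),     # left column, first block
    ("L2", (0, 12, 45, 22)),    # left column, second block
    ("R1", (55, 0, 100, 10)),   # right column, first block
    ("R2", (55, 12, 100, 22)),  # right column, second block
]

# Naive OCR-style ordering: top-to-bottom, then left-to-right.
naive = sorted(toy_boxes, key=lambda b: (b[1][1], b[1][0]))
print([name for name, _ in naive])  # ['L1', 'R1', 'L2', 'R2'] -- columns interleaved

# The human reading order is ['L1', 'L2', 'R1', 'R2']: finish the left column
# first. Recovering that order from layout alone is what LayoutReader does.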
Several readers have messaged me unsure how to use it, so below I take the output of a layout-analysis model and feed it into my open-sourced reading-order model, completing the pipeline. For reference; results first:
Research-report layout analysis followed by reading order; each region is tagged with its reading index, e.g. reader:1.
Paper layout analysis followed by reading order, tagged the same way, e.g. reader:1.
The full code is available at: https://github.com/yujunhuics/LayoutReader/blob/main/vis.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author : Junhui Yu

from collections import defaultdict

import cv2
import torch
from ultralytics import YOLO

from model import LayoutLMv3ForBboxClassification

CLS_TOKEN_ID = 0
UNK_TOKEN_ID = 3
EOS_TOKEN_ID = 2


def BboxesMasks(boxes):
    # Pack normalized boxes into model inputs: CLS + one UNK token per box + EOS.
    bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
    attention_mask = [1] + [1] * len(boxes) + [1]
    return {
        "bbox": torch.tensor([bbox]),
        "attention_mask": torch.tensor([attention_mask]),
        "input_ids": torch.tensor([input_ids]),
    }


def decode(logits, length):
    # Greedy decoding with conflict resolution: every position starts from its
    # argmax slot; when several positions claim the same slot, the highest-scoring
    # one keeps it and the others fall back to their next-best slot.
    logits = logits[1: length + 1, :length]  # drop CLS/EOS rows, keep real slots
    orders = logits.argsort(descending=False).tolist()
    ret = [o.pop() for o in orders]
    while True:
        order_to_idxes = defaultdict(list)
        for idx, order in enumerate(ret):
            order_to_idxes[order].append(idx)
        # Keep only slots claimed by more than one position.
        order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1}
        if not order_to_idxes:
            break
        for order, idxes in order_to_idxes.items():
            idxes_to_logit = {}
            for idx in idxes:
                idxes_to_logit[idx] = logits[idx, order]
            idxes_to_logit = sorted(
                idxes_to_logit.items(), key=lambda x: x[1], reverse=True
            )
            # Everyone but the winner retries its next-best slot.
            for idx, _ in idxes_to_logit[1:]:
                ret[idx] = orders[idx].pop()
    return ret
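

# A quick sanity check of decode() on invented numbers (not real model output):
# rows 0 and 1 both peak at slot 2; row 0 scores higher there (0.9 vs 0.8),
# so row 1 is bumped to its next-best slot, slot 0.
toy = torch.tensor([
    [0.1, 0.2, 0.9],
    [0.3, 0.1, 0.8],
    [0.2, 0.7, 0.1],
])
padded = torch.zeros(5, 5)  # dummy CLS/EOS rows so decode()'s slicing lines up
padded[1:4, :3] = toy
print(decode(padded, 3))  # -> [2, 0, 1]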
def layoutreader(bboxes):
    # Predict a reading order for [left, top, right, bottom] boxes given on
    # the 0-1000 scale the model expects.
    inputs = BboxesMasks(bboxes)
    logits = layoutreader_model(**inputs).logits.cpu().squeeze(0)
    orders = decode(logits, len(bboxes))
    return orders
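

# Standalone usage, for reference (requires layoutreader_model, which is
# loaded further down; the two boxes are invented and already normalized):
#     demo_boxes = [[520, 40, 980, 120],   # right-column block
#                   [20, 40, 480, 120]]    # left-column block
#     orders = layoutreader(demo_boxes)
# The returned permutation is consumed below as `[xyxyes[i] for i in orders]`
# to rearrange detections into reading order.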
# report label set (for report-8n.pt)
# id2name = {
#     0: 'Text',
#     1: 'Title',
#     2: 'Header',
#     3: 'Footer',
#     4: 'Figure',
#     5: 'Table',
#     6: 'Toc',
#     7: 'Figure caption',
#     8: 'Table caption',
#     9: 'Equation',
#     10: 'Footnote'
# }

# paper label set (for paper-8n.pt)
id2name = {
    0: 'Text',
    1: 'Title',
    2: 'Figure',
    3: 'Figure caption',
    4: 'Table',
    5: 'Table caption',
    6: 'Header',
    7: 'Footer',
    8: 'Reference',
    9: 'Equation'
}

# BGR colors for drawing each category.
color_map = {
    'Text': (255, 0, 255),
    'Title': (0, 255, 0),
    'Header': (125, 125, 0),
    'Footer': (255, 255, 0),
    'Figure': (0, 0, 255),
    'Table': (160, 32, 240),
    'Toc': (199, 97, 20),
    'Figure caption': (255, 90, 50),
    'Table caption': (255, 128, 0),
    'Equation': (255, 123, 123),
    'Footnote': (222, 110, 0)
}
image_path = 'page_4.png'

# Reading-order model.
# Download: https://modelscope.cn/models/yujunhuinlp/LayoutReader-only-layout-large
model_path = "./LayoutReader-only-layout-large"
layoutreader_model = LayoutLMv3ForBboxClassification.from_pretrained(model_path)

# Layout-analysis model (360LayoutAnalysis).
# Download: https://huggingface.co/qihoo360/360LayoutAnalysis
layout_model = YOLO('paper-8n.pt')
# layout_model = YOLO('report-8n.pt')

# Step 1: layout analysis.
result = layout_model(image_path, save=False, conf=0.45, save_crop=False, line_width=1)
print(result)

img = cv2.imread(image_path)
page_h, page_w = img.shape[:2]
# LayoutReader expects coordinates normalized to a 0-1000 page.
x_scale = 1000.0 / page_w
y_scale = 1000.0 / page_h

bbox_cls = result[0].boxes.cls.tolist()
xyxyes = result[0].boxes.xyxy.tolist()
confes = result[0].boxes.conf.tolist()
print(xyxyes)

# Step 2: clamp detections to the page and rescale to 0-1000.
boxes = []
for left, top, right, bottom in xyxyes:
    if left < 0:
        left = 0
    if right > page_w:
        right = page_w
    if top < 0:
        top = 0
    if bottom > page_h:
        bottom = page_h
    left = round(left * x_scale)
    top = round(top * y_scale)
    right = round(right * x_scale)
    bottom = round(bottom * y_scale)
    assert (
        1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0), \
        f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'
    boxes.append([left, top, right, bottom])
print(boxes)

# Step 3: predict reading order and rearrange the detections.
orders = layoutreader(boxes)
print(orders)
xyxyes = [xyxyes[i] for i in orders]
bbox_cls = [bbox_cls[i] for i in orders]
confes = [confes[i] for i in orders]
print(xyxyes)

# Step 4: draw each region with its reading index, category, and confidence.
for idx, b_cls, xyxy, conf in zip(range(len(xyxyes)), bbox_cls, xyxyes, confes):
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = xyxy[0], xyxy[1], xyxy[2], xyxy[3]
    cv2.rectangle(img,
                  (int(top_left_x), int(top_left_y)),
                  (int(bottom_right_x), int(bottom_right_y)),
                  color_map[id2name[b_cls]],
                  2)
    cv2.putText(img,
                f"reader:{idx}--" + id2name[b_cls] + ":" + str(round(conf, 2)),
                (int(top_left_x), int(top_left_y) + 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                color_map[id2name[b_cls]],
                3)  # label text: reading index, class, confidence

cv2.imwrite("vis-result.jpg", img)
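
From here the ordered regions can drive text extraction as well. A minimal sketch, assuming a per-region OCR callable such as pytesseract (not part of the script above; any OCR engine would do): crop each region from a clean copy of the page in reading order and concatenate the results.

# Hypothetical continuation: OCR each region in reading order.
# Assumes `pip install pytesseract` plus a local Tesseract install with the
# needed language data; swap in any OCR engine you prefer.
import pytesseract

clean = cv2.imread(image_path)  # re-read: `img` now has boxes drawn on it
page_text = []
for xyxy in xyxyes:  # already rearranged into reading order above
    x0, y0, x1, y1 = map(int, xyxy)
    page_text.append(pytesseract.image_to_string(clean[y0:y1, x0:x1]))
print("\n\n".join(page_text))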
About the author: Yu Junhui (余俊晖). Research interests: natural language processing, large language models, and document intelligence. Winner of nearly twenty first-, second-, and third-place finishes in domestic and international AI competitions and evaluations, including CCF, Kaggle, ICPR, ICDAR, CCL, and CAIL; author of several SCI and top-conference papers and a number of patents.