# Python: extract the lists of examples, figures, and tables from a docx document
from docx import Document
import re
# Caption paragraphs collected per category. The 'tuozhan' bucket is created
# here but nothing in this script ever fills it.
result = {'li': [], 'fig': [], 'tab': [], 'tuozhan': []}
doc = Document(r'C:\test.docx')

# One precompiled pattern per bucket. Each is applied with .match(), so the
# paragraph must *start* with the label character (例 = example, 图 = figure,
# 表 = table), followed by "<digits>-<digits>" and a single space.
# Raw strings keep '\d' from being treated as an (invalid) string escape.
_caption_patterns = (
    ('li', re.compile(r'例\d+-\d+ ')),
    ('fig', re.compile(r'图\d+-\d+ ')),
    ('tab', re.compile(r'表\d+-\d+ ')),
)

for p in doc.paragraphs:
    t = p.text  # text of the current paragraph
    for _key, _pattern in _caption_patterns:
        if _pattern.match(t):
            result[_key].append(t)
            # The labels are mutually exclusive, so the first hit is final.
            break

# Print each list in turn (examples, figures, tables), every list preceded by
# a 30-character '=' separator line.
for _key in ('li', 'fig', 'tab'):
    print('=' * 30)
    for _caption in result[_key]:
        print(_caption)