docx
文档内容
先建个word文档,并保存为./output.docx
。
from docx import Document
from docx.shared import Inches
def add_table(doc):
"""向word文档中添加表格"""
table = doc.add_table(rows=3, cols=3, style='Table Grid')
# 填充表格数据
row_count = 0
for row in table.rows:
col_count = 0
for cell in row.cells:
if row_count == 0: # 第一行加header
cell.text = f'Header {row_count + 1}-{col_count + 1}'
else:
cell.text = f'Cell {row_count + 1}-{col_count + 1}'
col_count += 1
row_count += 1
# 设置单元格宽度
table.cell(0, 0).width = Inches(1.5)
table.cell(0, 1).width = Inches(2.0)
table.cell(0, 2).width = Inches(1.5)
def make_doc():
"""生成word文档"""
doc = Document()
first = doc.add_paragraph('第一行')
# 设置“第一行”为Heading 1样式
first.style = doc.styles['Heading 1']
add_table(doc)
doc.add_paragraph('我的文档标题')
add_table(doc)
doc.add_paragraph('结束')
doc.add_paragraph('这里是结束后')
add_table(doc) # 验证结果是否正确
# 保存文档
doc.save('output.docx')
🚁 获取指定范围内的表格
class DocxReader:
def __init__(self, file: str):
self.document = Document(file) # openxml
self.paragraphs = self.document.paragraphs
self.all_tables = self.document.tables
def get_table_by_sections(self, start_str: str, end_str='') -> dict:
"""
获取docx文件中指定范围的所有表格id,并取表格上一行文本作为表名
:param start_str: 起始行文本,精准匹配
:param end_str: 结束行文本,精准匹配;无传参时从start_str下方开始查找最近的表格id
:return: {tableid:tablename,}
"""
find_tbls = {}
# 获取指定范围的所有表格ele,存储为{ele:表名}格式字典
for aPara in self.paragraphs:
if aPara.text == start_str:
ele = aPara._p
if end_str:
# 当end_str非空时,遍历start_str至end_str章节内所有表格id
while ele.text != end_str:
ele = ele.getnext()
if ele.tag[-3:] == 'tbl':
_ = ele.getprevious().text # 取表格上一行文本作为表名
find_tbls[ele] = _
else:
# 当end_str为空时,从start_str下方开始查找最近的表格id
while ele.tag[-3:] != 'tbl':
ele = ele.getnext()
_ = ele.getprevious().text # 取表格上一行文本作为表名
find_tbls[ele] = _
break
return find_tbls
def get_sepcified_table_content(self, start_str: str, end_str='') -> dict:
"""
获取docx文件中指定表格内容
:param start_str: 起始行文本,精准匹配
:param end_str: 结束行文本,精准匹配;无传参时从start_str下方开始查找最近的表格id
:return: {tablename:[ (content[0]),(content[1]) ... ],}
"""
find_tbls = self.get_table_by_sections(start_str=start_str, end_str=end_str)
res = {}
# 所有表格遍历,进行匹配存储为{表名:[(第n行内容)]}格式字典
for table in self.all_tables:
if table._tbl in find_tbls.keys():
tmp = find_tbls.get(table._tbl)
res_table = []
for row in table.rows:
res_row = tuple(col.text for col in row.cells)
res_table.append(res_row)
res[tmp] = res_table
return res
if __name__ == '__main__':
make_doc() # 生成文件至./output.docx
dr = DocxReader('output.docx')
res = dr.get_sepcified_table_content('第一行', '结束')
for k, v in res.items():
print(f'表名:{k}\n表内容:{v}\n')
res = dr.get_sepcified_table_content('结束')
for k, v in res.items():
print(f'表名:{k}\n表内容:{v}\n')
表名:第一行
表内容:[('Header 1-1', 'Header 1-2', 'Header 1-3'), ('Cell 2-1', 'Cell 2-2', 'Cell 2-3'), ('Cell 3-1', 'Cell 3-2', 'Cell 3-3')]
表名:我的文档标题
表内容:[('Header 1-1', 'Header 1-2', 'Header 1-3'), ('Cell 2-1', 'Cell 2-2', 'Cell 2-3'), ('Cell 3-1', 'Cell 3-2', 'Cell 3-3')]
表名:这里是结束后
表内容:[('Header 1-1', 'Header 1-2', 'Header 1-3'), ('Cell 2-1', 'Cell 2-2', 'Cell 2-3'), ('Cell 3-1', 'Cell 3-2', 'Cell 3-3')]
🚁 遍历文档所有h1标题的内容
if __name__ == '__main__':
make_doc() # 生成文件至./output.docx
dr = DocxReader('output.docx')
for paragraph in dr.paragraphs:
if 'Heading 1' in paragraph.style.name:
print(paragraph.text)
第一行