该博客详细介绍了如何使用Python的`python-docx`库来读写Word文档,包括读取段落、指定段落、块、标题、正文以及样式。还展示了如何写入标题、正文、分页符、图片、表格,以及进行对齐、删除、插入和格式化操作。此外,还涵盖了将Word批量转换为PDF的方法。
- pip install python-docx
-
读全部word段落
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx') # word文件路径
- for i in a.paragraphs: # word对象的item
- print(i.text)
-
读指定word段落
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx')
- for i in a.paragraphs[0:3]: # 可以对word对象切片操作
- print(i.text)
-
读word中每段落中的块
# A run is a stretch of text inside one paragraph that shares the same character formatting.
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\test1.docx')
first_paragraph = doc.paragraphs[0]  # a single Paragraph object (the first one)
for run in first_paragraph.runs:     # .runs is the list of runs in that paragraph
    print(run.text)
-
读word操作示例
- #计算文档中所有段落出现'1'的次数
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx')
- numbers = 0
-
- for i in a.paragraphs: # 这个只能拿到段落,(注意不能拿到表格)
- if '1' in i.text:
- numbers += 1
- print(numbers)
-
-
-
-
-
-
-
# Count the table cells in the document whose text contains '1'.
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\test1.docx')

numbers = 0
for table in doc.tables:          # every table in the document
    for row in table.rows:        # walk each table row by row
        for cell in row.cells:    # every cell of the current row
            if '1' in cell.text:  # must look for '1', the character this example counts
                numbers += 1
print(numbers)
-
只读word中标题
- # 读段落中的指定标题
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx')
- for i in a.paragraphs:
- if i.style.name == 'Heading 1': # Heading 1代表1级标题, 同理 Heading 2 代表2级,最高到9级
- print(i.text)
-
-
-
-
-
-
-
# Print every heading regardless of level by matching the paragraph style
# name ("Heading 1", "Heading 2", ...) against a regular expression.
import re

from docx import Document

# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
# Compiled once here instead of being re-evaluated for every paragraph.
heading_style = re.compile(r'^Heading \d+$')

doc = Document(r'C:\Users\Administrator\Desktop\test1.docx')
for paragraph in doc.paragraphs:
    if heading_style.match(paragraph.style.name):
        print(paragraph.text)
-
只读word中正文
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx') # word文件路径
- for i in a.paragraphs: # word对象的item
- if i.style.name == 'Normal': # Normal 代表正文
- print(i.text)
-
读word中的所有样式类
- from docx import Document
- from docx.enum.style import WD_STYLE_TYPE
-
- a = Document(r'C:\Users\Administrator\Desktop\test1.docx') # word文件路径
- title = a.styles
- for i in title:
- if i.type == WD_STYLE_TYPE.PARAGRAPH:
- print(i.name)
-
写入标题
from docx import Document

doc = Document()
# add_heading(text, level): level accepts 0-9.
# level=0 produces the document "Title" style; level=1 is a first-level heading,
# which is what the heading text below describes.
doc.add_heading('添加的一级标题', level=1)
doc.save(r'C:\Users\Administrator\Desktop\111.docx')
-
写入正文(段落)
- from docx import Document
-
- a = Document()
- a.add_paragraph('正文')
- a.save(r'C:\Users\Administrator\Desktop\111.docx')
-
写入分页符
- from docx import Document
-
- a = Document()
- a.add_page_break() # 跳到下一页(分页符)
- a.add_paragraph('正文')
- a.save(r'C:\Users\Administrator\Desktop\111.docx')
-
写入正文中的块
- from docx import Document
-
- a = Document()
- b = a.add_paragraph('我是正文在我后面添加的文字会被设置格式:')
- b.add_run('加粗').bold = True
- b.add_run('普通')
- b.add_run('斜体').italic = True
- a.save(r'C:\Users\Administrator\Desktop\111.docx')
-
定位word中某个段落进行操作
# Locating a paragraph by index
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\111.docx')
# The attribute is `paragraphs` (plural); `paragraph` does not exist on Document.
print(len(doc.paragraphs))   # total number of paragraphs
first = doc.paragraphs[0]    # the first paragraph
print(first.text)
-
-
-
-
-
-
-
# Insert a new paragraph in front of an existing one
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\111.docx')
print(len(doc.paragraphs))   # total number of paragraphs
first = doc.paragraphs[0]    # the paragraph to insert in front of
first.insert_paragraph_before('这是新添加的段落在第一段之前')
# save() belongs to the Document, not to the Paragraph.
doc.save(r'C:\Users\Administrator\Desktop\111.docx')
-
插入图片且设置图片的大小
- from docx import Document
- from docx.shared import Cm # 设置图片宽高的类
-
- a = Document()
- a.add_picture(r'C:\Users\Administrator\Desktop\微信图片_20210212222846.jpg', width=Cm(13), height=Cm(8))
- # 添加图片宽高
- a.save(r'C:\Users\Administrator\Desktop\111.docx')
-
给word中表格中添加图片
from docx import Document
from docx.shared import Cm

doc = Document()
# A freshly created document contains no tables, so one has to be added
# before a cell can be addressed (doc.tables[0] would raise IndexError).
table = doc.add_table(rows=1, cols=1)
# Row 0 / column 0 cell -> its first paragraph -> a new, empty run
run = table.cell(0, 0).paragraphs[0].add_run()
run.add_picture(r'C:\Users\Administrator\Desktop\微信图片_20210212222846.jpg', width=Cm(13), height=Cm(8))
doc.save(r'C:\Users\Administrator\Desktop\111.docx')
-
删除图片
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\111.docx')

target = doc.paragraphs[1]
# clear() removes all content of this paragraph, so only a picture
# that lives in the 2nd paragraph is deleted.
target.clear()
# Write the document back, otherwise the deletion is lost.
doc.save(r'C:\Users\Administrator\Desktop\111.docx')
-
插入多张图片,以其中某一张图片的比例为基准插入图片
from docx import Document

doc = Document()

# A new document has no paragraphs yet, so create one paragraph per picture.
pic1 = doc.add_paragraph().add_run().add_picture('图片1路径.jpg')
pic2 = doc.add_paragraph().add_run().add_picture('图片2路径.jpg')

# Use picture 2 as the reference: picture 1 gets the same height, and its
# width is scaled by the same factor so its aspect ratio is kept.
scale = pic2.height / pic1.height
new_width = int(pic1.width * scale)
new_height = int(pic1.height * scale)
pic1.width = new_width
pic1.height = new_height

doc.save(r'C:\Users\Administrator\Desktop\111.docx')
print(doc.inline_shapes[0].height)  # height of the first picture
print(doc.inline_shapes[1].height)  # height of the second picture
-
对齐操作
- from docx import Document
- from docx.shared import Cm
- from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
-
- a = Document()
- photo = a.add_picture(r'C:\Users\Administrator\Desktop\微信图片_20210212222846.jpg', width=Cm(5), height=Cm(5))
- a.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
- # CENTER(居中) LEFT(左对齐) RIGHT(右对齐) JUSTIFY(两端对齐) DISTRIBUTE(分散对齐)
-
新建表格
- from docx import Document
-
- a = Document()
- a1 = [
- ['姓名', '性别', '年龄'],
- ['王大锤', '男', 20],
- ['白元芳', '女', 25]
- ]
- r = 3 # 行
- c = 3
- tables = a.add_table(rows=r, cols=c)
- for rs in range(r):
- r_num = tables.rows[rs].cells # 每一行的所有单元格
- for cs in range(c):
- r_num[cs].text = str(a1[rs][cs])
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
在原表的基础上增加表的行列数
- from docx import Document
- from docx.shared import Cm
-
- a = Document(r'C:\Users\Administrator\Desktop\11.docx')
- a.tables[0].add_row() # 在原表的基础上增加一行
- a.tables[0].add_column(Cm(5)) # 添加列必须设置列宽
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
查看表中的行列数量
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\11.docx')
- print(len(a.tables[0].rows)) # 多少行
- print(len(a.tables[0].columns)) # 多少列
-
- row = a.tables[0].rows[0] # 第一行 # 定位元素
- col = a.tables[0].columns[0] # 第一列
-
删除表中数据
# Delete one row of a table
from docx import Document

doc = Document(r'C:\Users\Administrator\Desktop\11.docx')
print(len(doc.tables))  # number of tables in the document

table1 = doc.tables[0]  # the first table
print(len(table1.rows), len(table1.columns))  # rows / columns before the deletion

second_row = table1.rows[1]
# No public API for this is used here: detach the row's XML element from its parent.
second_row._element.getparent().remove(second_row._element)
print(len(table1.rows), len(table1.columns))  # rows / columns after the deletion
doc.save(r'C:\Users\Administrator\Desktop\11.docx')
-
-
-
-
-
- #删除指定列
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\11.docx')
- print(len(a.tables)) # 查看有多少表格
-
- table1 = a.tables[0] # 定位为第一张表
- cl = table1.columns[0] # 第一列
- for i in cl.cells:
- i._element.getparent().remove(i._element) # 删除第一列的所有数据
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
-
-
-
-
-
-
- #删除整张表
- from docx import Document
-
- a = Document(r'C:\Users\Administrator\Desktop\11.docx')
- t1 = a.tables[0]
- t1._element.getparent().remove(t1._element)
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
给表格中指定单元格中录入内容
- #方法1(不可做格式类的操作,比如对齐)
- from docx import Document
-
- a = Document()
- t1 = a.add_table(3, 3)
- t1.cell(0, 0).text = '赋值' # 0行0列
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
-
-
-
-
-
- #方法2
- from docx import Document
- from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
-
- a = Document()
- t1 = a.add_table(3, 3)
- pa = t1.cell(0, 2).paragraphs[0] # 0行 2列 第一个段落添加
- pa.text = '方法2'
- pa.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER #居中对齐
- a.save(r'C:\Users\Administrator\Desktop\11.docx')
-
-
-
-
-
-
# Method 3: copy values from an Excel sheet into a Word table
import pandas as pd
from docx import Document

data = pd.read_excel('excel路径', header=None)
file = Document('word路径')
table = file.add_table(3, 4)
for row_idx in range(3):
    for col_idx in range(4):
        # Cell text must be a string, so convert whatever pandas read.
        table.cell(row_idx, col_idx).text = str(data.iloc[row_idx, col_idx])
file.save('保存后的路径.docx')
-
删除表中指定单元格的内容(删整行/整列)
# Method 1: clear a single cell
from docx import Document

doc = Document('word路径')
table = doc.tables[0]  # the first table
table.cell(0, 0).text = ''
# Persist the change; without save() the file on disk is untouched.
doc.save('word路径')
-
-
-
-
-
-
-
# Method 2: clear a single cell through its first paragraph
from docx import Document

doc = Document('word路径')
table = doc.tables[0]
table.cell(0, 1).paragraphs[0].text = ''
# save() requires a target path (or file-like object).
doc.save('word路径')
-
-
-
# Method 3: clear the contents of a whole row
from docx import Document

doc = Document('word路径')
table = doc.tables[0]
for cell in table.rows[0].cells:
    cell.text = ''
# save() requires a target path (or file-like object).
doc.save('word路径')
-
-
-
-
-
-
-
# Method 4: clear the contents of a whole column
from docx import Document

doc = Document('word路径')
table = doc.tables[0]
for cell in table.columns[0].cells:
    cell.text = ''
# save() requires a target path (or file-like object).
doc.save('word路径')
-
表的对齐方式
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT  # table alignment constants

doc = Document('word路径')
first_table = doc.tables[0]  # first table in the document
# Available values: LEFT, CENTER, RIGHT
first_table.alignment = WD_TABLE_ALIGNMENT.RIGHT
doc.save('c:/14.docx')
-
调整表格的列宽(行高)
- #列宽
- from docx import Document
- from docx.shared import Cm # 单位转换函数
-
- a = Document('word路径')
- t = a.tables[0]
- for 列 in t.columns:
- for 单元格 in 列.cells:
- 单元格.width = Cm(1)
- a.save('c:/14.docx')
-
-
-
-
-
# Row height
from docx import Document
from docx.shared import Cm  # length helper (centimetres)

doc = Document('word路径')
table = doc.tables[0]
for row in table.rows:
    # Height is a property of the row; cells only expose `width`, so
    # assigning `cell.height` has no effect on the document.
    row.height = Cm(1)
doc.save('c:/14.docx')
-
表格中单元格的对齐方式
- from docx import Document
- from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT # 单元格垂直对齐
- from docx.enum.text import WD_PARAGRAPH_ALIGNMENT # 段落对齐
-
- a = Document('文件路径')
- t = a.tables[0]
- cl = t.cell(0, 1) # 指定第一行第二列的单元格
- cl.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.TOP # 顶部对齐
- cl.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.CENTER # 段落居中对齐
- a.save('c:/11.docx')
-
- #查询代码中所有表格样式(生成表格样式)
- from docx.enum.style import WD_STYLE_TYPE
- from docx import Document
- a = Document()
- styles = a.styles
-
- for i in styles:
- if i.type == WD_STYLE_TYPE.TABLE:
- a.add_paragraph('表格样式:' + i.name)
- t = a.add_table(3, 3, style=i)
- c5 = t.rows[0].cells
- c5[0].text = '第一列内容'
- c5[1].text = '第二列内容'
- c5[2].text = '第三列内容'
- a.add_paragraph('\n')
- a.save(r'C:\Users\Administrator\Desktop\1.docx')
-
-
-
-
-
-
# Create a table with visible grid lines, then set the font of every cell
from docx import Document
from docx.oxml.ns import qn  # needed for the East-Asian font attribute below

doc = Document()
# The style name is one of the built-in table styles of the default template.
table = doc.add_table(3, 3, style='Medium Grid 1 Accent 1')
header_cells = table.rows[0].cells
header_cells[0].text = '姓名'
header_cells[1].text = '性别'
header_cells[2].text = '年龄'

# Change the font of all text in all cells
for row in table.rows:
    for cell in row.cells:
        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                run.font.name = 'Arial'  # font for Latin text
                # Font for Chinese (East-Asian) text is a separate XML attribute.
                run._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

doc.save('文件路径.docx')
-
修改文档中所有文字的样式
- from docx.oxml.ns import qn
- from docx import Document
- from docx.shared import Pt, RGBColor # 字号,颜色
-
- a = Document()
- for 段落 in a.paragraphs:
- for 块 in 段落.runs:
- 块.font.name = 'Arial' # 文档中非中文内容的字体
- 块._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑') # 文档中中文字体的样式
- 块.font.bold = True # 加粗
- 块.font.italic = True # 斜体
- 块.font.underline = True # 下划线
- 块.font.strike = True # 删除线
- 块.font.shadow = True # 阴影
- 块.font.all_caps = True # 全部大写字母
- 块.font.size = Pt(24) # 24号字
- 块.font.color.rgb = RGBColor(255, 0, 0) # 颜色
- a.save('文件.docx')
-
指定修改文档中样式
- from docx.oxml.ns import qn
- from docx import Document
-
-
- a = Document()
- a.styles['Normal'].font.name = 'Arial' # 正文
- a.styles['Heading 1'].font.name = 'Arial' # 标题
-
- a.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑') # 文档中正文文字体的样式
- a.styles['Heading 1']._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') # 文档中标题文字体的样式
- a.save('文件.docx')
-
- from docx import Document
- from docx.enum.text import WD_ALIGN_PARAGRAPH
-
-
- a = Document()
- for i in a.paragraphs:
- if i.style.name == 'Normal': #如果想居中标题,这里改标题就行
- i.alignment = WD_ALIGN_PARAGRAPH.CENTER # LEFT(左) RIGHT(右) CENTER(中) JUSTIFY(两端) DISTRIBUTE(分散)
- a.save('文件.docx')
-
- from docx import Document
-
- a = Document()
- for i in a.paragraphs:
- if i.style.name == 'Normal': # 正文 (如果要改全部就取消if语句)
- i.paragraph_format.line_spacing = 9.0 # 行间距
- a.save('文件.docx')
-
正文整体缩进
- from docx import Document
- from docx.shared import Inches # pt磅 cm厘米 inches英寸 mm毫米
-
- a = Document()
- for i in a.paragraphs:
- if i.style.name == 'Normal': # 如果等于正文
- i.paragraph_format.left_indent = Inches(0.3) # 右缩进right_indent
- a.save('文件.docx')
-
首行缩进
- from docx import Document
- from docx.shared import Inches
-
- a = Document()
- for i in a.paragraphs:
- if i.style.name == 'Normal': # 如果等于正文
- i.paragraph_format.first_line_indent = Inches(0.3) # 首行缩进
- a.save('文件.docx')
-
悬挂缩进(首行不缩进,其余缩进)
- from docx import Document
- from docx.shared import Inches # pt磅 cm厘米 inches英寸 mm毫米
-
- a = Document()
- for i in a.paragraphs:
- if i.style.name == 'Normal': # 如果等于正文
- i.paragraph_format.first_line_indent = Inches(-0.3) # 改为负数即可悬挂缩进
- a.save('文件.docx')
-
查询空文档的节和段落
- from docx import Document
-
- a = Document()
- print(len(a.sections)) # 节
- print(len(a.paragraphs)) # 段落
-
添加节
- from docx import Document
-
- a = Document()
- print(len(a.sections)) # 节
- print(len(a.paragraphs)) # 段落
-
- a.add_section() # 添加一个节
- print(len(a.sections)) # 节
- print(len(a.paragraphs)) # 段落
-
每节中添加段落
from docx import Document

doc = Document()
print(len(doc.sections))    # sections
print(len(doc.paragraphs))  # paragraphs

doc.add_section()           # add one section
print(len(doc.sections))    # sections
print(len(doc.paragraphs))  # paragraphs

# NOTE(review): on a blank document add_section() appears to create a single
# paragraph (the one carrying the first section's break), so index 1 does not
# exist yet. Put the first section's text there and append a new paragraph
# for the second section. Confirm against the printed counts above.
doc.paragraphs[0].add_run('第一节中的块')
doc.add_paragraph().add_run('第二节中的块')
doc.save('c:/18.docx')
-
对节的定位
- from docx import Document
-
- a = Document('文件路径')
- b = a.sections[0] # 切片
-
改分节符
- from docx import Document
- from docx.enum.section import WD_SECTION_START
-
- a = Document('路径')
- b = a.sections[1]
- print('改前分解符类型:', b.start_type) # 更改分节符之前
-
- b.start_type = WD_SECTION_START.ODD_PAGE # 奇数分节符
- print('改分节符之后:', b.start_type)
-
- # CONTINUOUS 连续分隔符
- # NEW_COLUMN 新列分隔符
- # NEW_PAGE 新页的分隔符
- # EVEN_PAGE 偶数页的分隔符
- # ODD_PAGE 奇数页的分隔符
-
Word批量转换为PDF:安装模块
- pip install comtypes
-
示例代码
- import os
- import comtypes.client
-
-
def get_path(path='C:/word'):
    """Yield ``(word_path, pdf_path)`` for each Word file in *path* that has no PDF yet.

    Args:
        path: Directory to scan (not recursive). Defaults to ``'C:/word'``.

    Yields:
        Tuples of the Word file's full path and the full path of the PDF
        that should be produced for it.
    """
    filename_list = os.listdir(path)
    existing = set(filename_list)
    for wordname in filename_list:
        stem, ext = os.path.splitext(wordname)
        # Compare the real extension so names that merely end in "doc"
        # (no dot) are not mistaken for Word files.
        if ext.lower() not in ('.doc', '.docx'):
            continue
        # "~$name.docx" files are lock files Word creates next to open documents.
        if wordname.startswith('~$'):
            continue
        # Keep the dot, otherwise "a.docx" would map to "apdf".
        pdfname = stem + '.pdf'
        # Skip files whose PDF already exists in the directory.
        if pdfname in existing:
            continue
        yield os.path.join(path, wordname), os.path.join(path, pdfname)
-
-
def convert_word_to_pdf():
    """Convert every pending Word file reported by ``get_path()`` to PDF.

    Drives Microsoft Word through COM, so it needs Windows with Word installed.
    """
    word = comtypes.client.CreateObject('Word.Application')
    word.Visible = 0  # run Word in the background
    try:
        for wordpath, pdfpath in get_path():
            document = word.Documents.Open(wordpath)
            try:
                document.SaveAs(pdfpath, FileFormat=17)  # 17 = wdFormatPDF
            finally:
                # Close the document even when the export fails.
                document.Close()
    finally:
        # Quit Word so no hidden WINWORD process is left running.
        word.Quit()


if __name__ == '__main__':
    convert_word_to_pdf()
-