# Python处理PDF——pdfplumber的安装与使用 (Processing PDFs with Python: installing and using pdfplumber)
# https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md
# -*- coding:utf-8 -*-
"""
@Time :2023/XX/XX
@Auth :Stone
@File :parse_online_pdf.py
@DESC :在线解析PDF文档
"""
import itertools
import os
import re
import tempfile
import time

import pdfplumber
import requests
# Matches "<scheme>://<host>" at the very start of the string.  Compiled
# case-insensitively because URI schemes and host names are not case-sensitive;
# without the flag "https://Example.com/a.pdf" was mistaken for a local path.
_URL_PATTERN = re.compile(
    r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])''',
    re.IGNORECASE,
)

# Headers sent when the caller does not provide any.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
}

# Every accepted spelling of ``mode`` (after ``str(mode).lower()``) mapped to
# its canonical name.  '0' is an undocumented legacy value: the extraction loop
# used to collect both text and tables for it, but no return branch handled it,
# so it is treated as 'text_and_table' here.
_MODE_ALIASES = {
    '1': 'text',
    'text': 'text',
    '2': 'table',
    'table': 'table',
    '3': 'text_and_table',
    'text_and_table': 'text_and_table',
    '0': 'text_and_table',
}


def _normalize_mode(mode):
    """Return the canonical mode name for *mode* or raise ``ValueError``."""
    try:
        return _MODE_ALIASES[str(mode).lower()]
    except KeyError:
        raise ValueError(
            f"unsupported mode {mode!r}; expected 1/'text', 2/'table' "
            f"or 3/'text_and_table'"
        ) from None


def _download_pdf(pdf_url, url_params, proxies, save_as):
    """Download *pdf_url* to disk and return the absolute path of the file."""
    # Work on a copy so the caller's dict is never mutated.
    request_kwargs = dict(url_params or {})
    if not request_kwargs.get('headers'):
        request_kwargs['headers'] = dict(_DEFAULT_HEADERS)
    # A 'proxies' entry inside url_params keeps priority; the dedicated
    # argument fills the gap when url_params does not carry one.
    if proxies and not request_kwargs.get('proxies'):
        request_kwargs['proxies'] = proxies

    # POST is used as soon as a request body or query parameters are given.
    if request_kwargs.get('data') or request_kwargs.get('params'):
        response = requests.post(pdf_url, **request_kwargs)
    else:
        response = requests.get(pdf_url, **request_kwargs)
    # Fail here instead of writing an HTML error page to disk and letting the
    # PDF parser choke on it later.
    response.raise_for_status()

    if save_as:
        pdf_path = save_as
        out_file = open(pdf_path, 'wb')
    else:
        # mkstemp gives a unique file in the system temp directory, so
        # concurrent calls cannot collide.
        handle, pdf_path = tempfile.mkstemp(prefix='~temp', suffix='.pdf')
        out_file = os.fdopen(handle, 'wb')

    try:
        with out_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    out_file.write(chunk)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        if not save_as:
            try:
                os.remove(pdf_path)
            except OSError:
                pass
        raise
    return os.path.abspath(pdf_path)


def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    """Extract text and/or tables from a local or online PDF with pdfplumber.

    Args:
        path_or_url: Path of a local PDF, or a URL ("<scheme>://<host>...").
            Anything that is not a string matching the URL pattern is handed
            to pdfplumber as a local path.
        mode: What to return (compared case-insensitively after ``str()``):
            1 / '1' / 'text'            -> text of all pages, as one ``str``
            2 / '2' / 'table'           -> tables of all pages, as a ``list``
            3 / '3' / 'text_and_table'  -> ``(text, tables)`` tuple
        url_params: Optional dict of keyword arguments forwarded to
            ``requests.get`` / ``requests.post`` when downloading.  A POST is
            sent when it carries a truthy 'data' or 'params' entry.  The dict
            is not modified.
        proxies: Optional proxies used for the download when ``url_params``
            does not already contain a 'proxies' entry.
        save_as: Optional path; when given, the downloaded PDF is stored there
            and kept.  Otherwise a temporary file is used and deleted.

    Returns:
        ``str``, ``list`` or ``tuple`` depending on ``mode`` (see above).
        Text of consecutive pages is concatenated without a separator; a page
        without text contributes an empty string.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.  Raised
            before any file or network access.
        requests.HTTPError: If the download answers with an error status.
    """
    canonical_mode = _normalize_mode(mode)
    want_text = canonical_mode in ('text', 'text_and_table')
    want_tables = canonical_mode in ('table', 'text_and_table')

    url_mode = isinstance(path_or_url, str) and bool(_URL_PATTERN.search(path_or_url))
    if url_mode:
        pdf_path = _download_pdf(path_or_url, url_params, proxies, save_as)
    else:
        pdf_path = path_or_url

    page_texts = []
    pdf_tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                if want_text:
                    # extract_text() may yield None for a page without text;
                    # str(None) used to inject the literal "None".
                    page_texts.append(page.extract_text() or '')
                if want_tables:
                    pdf_tables.extend(page.extract_tables())
    finally:
        # Remove the temporary download even when parsing fails.
        if url_mode and not save_as:
            try:
                os.remove(pdf_path)
            except OSError:
                # Best effort: a leftover temp file must not mask the result
                # or the original parsing error.
                pass

    pdf_text = ''.join(page_texts)
    if canonical_mode == 'text':
        return pdf_text
    if canonical_mode == 'table':
        return pdf_tables
    return pdf_text, pdf_tables
def replace_str(str_font):
    """Return *str_font* as text with every newline and space removed.

    Args:
        str_font: Any value; it is converted with ``str()``.  ``None`` is
            treated as an empty cell and yields ``''`` instead of the literal
            string ``'None'`` (presumably table cells coming from pdfplumber
            can be ``None`` -- confirm against the caller's data).

    Returns:
        The cleaned string.
    """
    if str_font is None:
        return ''
    # Removing every space already turns ": " into ":", so the former third
    # ``.replace(': ', ':')`` could never match and has been dropped.
    return str(str_font).replace('\n', '').replace(' ', '')
def link_last_list(need_list):
    """Merge continuation rows into the row that precedes them.

    A row whose first cell is the empty string is treated as the continuation
    of the previous row: its cells are appended, column by column, to the text
    of the row above.  A continuation row that appears before any other row
    is kept as a row of its own.

    Args:
        need_list: Iterable of rows, each row being a list of cells.  The
            rows passed in are not modified.

    Returns:
        A new list of rows with continuation rows folded in.  Merged cells are
        ``str(previous) + str(current)``.  When the two rows differ in length
        the shorter one is padded with ``''`` so no cell is lost and no
        ``IndexError`` is raised.  Rows without any cell are skipped.
    """
    result_total = []
    for current_list in need_list:
        if not current_list:
            # A row with no cells carries no data; indexing [0] would raise.
            continue
        if current_list[0] == '' and result_total:
            result_total[-1] = [
                str(previous) + str(current)
                for previous, current in itertools.zip_longest(
                    result_total[-1], current_list, fillvalue=''
                )
            ]
        else:
            result_total.append(current_list)
    print(f"获取到所有数组合并后为={result_total}")
    return result_total
if __name__ == '__main__':
    # Placeholder address; replace it with the PDF to parse.
    source = "********************************"
    tables = online_pdf_parse(source, mode='table')

    # Tables are extracted page by page, so a single logical row can end up
    # split into two rows.  Flatten all tables into one list of cleaned rows
    # first, then stitch the split rows back together.
    cleaned_rows = [
        [replace_str(cell) for cell in row]
        for table in tables
        for row in table
    ]
    merged_rows = link_last_list(cleaned_rows)
    print(f"拼接后的数组为={merged_rows}")