Python处理PDF——pdfplumber的安装与使用
https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md
- # -*- coding:utf-8 -*-
-
- """
- @Time :2023/XX/XX
- @Auth :Stone
- @File :parse_online_pdf.py
- @DESC :在线解析PDF文档
- """
- import requests
- import pdfplumber
- import re, time, os
-
-
def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    """Parse a local or online PDF document with pdfplumber.

    Args:
        path_or_url: local file path or URL of the PDF document.
        mode: what to extract —
            [1, '1', 'text']            -> str, concatenated page text
            [2, '2', 'table']           -> list, all extracted tables
            [3, '3', 'text_and_table']  -> tuple (text, tables)
        url_params: extra keyword arguments for the requests call (dict).
            Missing 'headers'/'data'/'params' keys are filled in with
            defaults (the original raised KeyError on a partial dict).
        proxies: proxies mapping passed through to requests.
        save_as: when downloading, save the PDF to this path instead of a
            throw-away temp file (which is deleted after parsing).

    Returns:
        str, list or (str, list) depending on *mode*; None for an
        unrecognized mode.

    Raises:
        requests.HTTPError: if the download responds with an error status.
    """
    # Anything that looks like "scheme://host" is treated as a URL.
    url_mode = bool(re.search(
        r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])''',
        path_or_url))

    if url_mode:
        headers_d = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)'}
        # Work on a copy so the caller's dict is not mutated, and fill in
        # any missing keys (the original indexed them directly and raised
        # KeyError when the caller supplied a partial dict).
        url_params = dict(url_params) if url_params else {}
        url_params.setdefault('headers', None)
        url_params.setdefault('data', None)
        url_params.setdefault('params', None)
        if not url_params['headers']:
            url_params['headers'] = headers_d
        url_params['proxies'] = proxies
        # POST when a body/query payload was supplied, GET otherwise.
        if url_params['data'] or url_params['params']:
            response = requests.post(path_or_url, **url_params)
        else:
            response = requests.get(path_or_url, **url_params)
        # Fail early instead of writing an HTML error page to disk and
        # letting pdfplumber choke on it.
        response.raise_for_status()

        # Write the download to a file (temp or save_as) before parsing.
        pdf_path = save_as if save_as else f'~temp{time.time()}~.pdf'
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    else:
        pdf_path = path_or_url

    pdf_path = os.path.abspath(pdf_path)

    # Parse the PDF with pdfplumber. Hoist the mode key out of the loop;
    # '3'/'text_and_table' must enable BOTH extractions (the original left
    # 'text_and_table' out of these conditions, so that mode returned
    # ('', []) regardless of document content).
    mode_key = str(mode).lower()
    want_text = mode_key in ('1', 'text', '0', '3', 'text_and_table')
    want_tables = mode_key in ('2', 'table', '0', '3', 'text_and_table')
    pdf_text = ''
    pdf_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if want_text:
                pdf_text += str(page.extract_text())
            if want_tables:
                pdf_tables += page.extract_tables()

    # Best-effort removal of the temporary download; keep it if the caller
    # asked for a saved copy.
    if url_mode and not save_as:
        try:
            os.remove(pdf_path)
        except OSError:
            pass

    if mode_key in ('1', 'text'):
        return pdf_text
    if mode_key in ('2', 'table'):
        return pdf_tables
    if mode_key in ('3', 'text_and_table'):
        return pdf_text, pdf_tables
-
-
def replace_str(str_font):
    """Normalize a table cell: stringify it and strip newlines and spaces.

    Note: the original chained a final .replace(': ', ':'), but after every
    space has been removed the substring ': ' can never occur, so that call
    was dead code and has been dropped — behavior is unchanged.

    Args:
        str_font: any value; it is converted with str() first.

    Returns:
        The cleaned string.
    """
    return str(str_font).replace('\n', '').replace(' ', '')
-
-
def link_last_list(need_list):
    """Fold page-split continuation rows into their predecessor row.

    PDF table extraction splits a logical row across pages; such a
    continuation row arrives with '' in its first cell. Each continuation
    row is concatenated element-wise onto the last accumulated row; every
    other row starts a new entry. Order is preserved.

    Robustness fixes over the original: an empty row no longer raises
    IndexError on current_list[0], and a continuation row shorter than the
    previous row pads with '' instead of raising IndexError. As before,
    the merged row keeps the previous row's length (extra trailing cells
    of the continuation row are dropped).

    Args:
        need_list: list of rows (lists of cell strings).

    Returns:
        New list with continuation rows merged into their predecessors.
    """
    result_total = []
    for current_list in need_list:
        if current_list and current_list[0] == '' and result_total:
            prev = result_total[-1]
            merged = []
            for i, prev_cell in enumerate(prev):
                # Pad with '' when the continuation row is shorter.
                extra = current_list[i] if i < len(current_list) else ''
                merged.append(str(prev_cell) + str(extra))
            result_total[-1] = merged
        else:
            # Non-continuation row, empty row, or no predecessor yet.
            result_total.append(current_list)
    print(f"获取到所有数组合并后为={result_total}")
    return result_total
-
-
if __name__ == '__main__':
    # URL of the PDF to fetch (redacted in the published example).
    pdf_url = f"********************************"
    pdf_text = online_pdf_parse(pdf_url, mode='table')
    # Tables come back grouped per page, so one logical row may be split
    # across pages; flatten everything into a single list of cleaned rows.
    data = [
        [replace_str(cell) for cell in row]
        for page_tables in pdf_text
        for row in page_tables
    ]
    result_list = link_last_list(data)
    print(f"拼接后的数组为={result_list}")