# Python: convert table-image files to Excel, extract data from Excel tables, and extract data from table PDFs.
def Image_Excel(APP_ID, API_KEY, SECRET_KEY, img_response_body):
    """Submit a table image to Baidu OCR's async table-recognition service
    and poll until the converted Excel file's download URL is available.

    Args:
        APP_ID / API_KEY / SECRET_KEY: Baidu AI platform credentials.
        img_response_body: raw image bytes (per Baidu's limits the image
            must be between 1 KB and 4 MB, otherwise the API errors out).

    Returns:
        The download URL string for the generated Excel file
        (``result['result']['result_data']`` from the Baidu API).
    """
    # Create the Baidu OCR client.
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # Kick off asynchronous table recognition on the image bytes.
    table = client.tableRecognitionAsync(img_response_body)
    # The async call returns a request id used to poll for the result.
    request_id = table['result'][0]['request_id']
    result = client.getTableRecognitionResult(request_id)
    # Poll until the job status message is '已完成' ("finished").
    # NOTE(review): no timeout — a permanently failed job would loop forever;
    # confirm whether an upper bound on retries is desired.
    while result['result']['ret_msg'] != '已完成':
        time.sleep(2)  # back off between polls to avoid hammering the API
        result = client.getTableRecognitionResult(request_id)
    # 'result_data' holds the download URL of the produced Excel file.
    return result['result']['result_data']
# NOTE: before submitting, check the image size — it must be between 1 KB and 4 MB,
# otherwise the API reports: "'error_msg': 'image size error, image is too big or
# too small, upper limit 4M, lower limit 1k, please check your param'".
def img_2_table(img_read):
    """Recognize a table image via Baidu OCR using the module-level
    credentials and return the download URL of the generated Excel file."""
    return Image_Excel(APP_ID, API_KEY, SECRET_KEY, img_read)
def get_excel_table(url, index=None, method=None, cols=None, keys=None, header=None, headers=None):
    """
    Fetch an Excel workbook from *url* and return its tables as JSON-style data.

    Format: {sheet_name: [{row-1 k/v pairs}, {row-2 k/v pairs}, ...]}.
    Date cells may come back as strings ("2021-8-25") or as int timestamps.

    Args:
        url: URL or path of the .xlsx file.
        index: header-row index forwarded to ``change_df``; recomputed via
            ``get_table_index`` when *keys* is given.
        method / cols / header: forwarded to ``change_df`` / ``read_excel``.
        keys: header keywords used to locate the header row.
        headers: HTTP headers dict used as pandas ``storage_options``.
            Defaults to ``{"User-Agent": USER_AGENT}``.  (The old default
            passed the bare USER_AGENT string — not a valid mapping — and
            was evaluated at def time, before USER_AGENT was defined.)
    """
    if headers is None:
        # Resolve at call time: USER_AGENT is defined later in the module.
        headers = {"User-Agent": USER_AGENT}
    print(url, '正在处理...')
    # sheet_name=None -> read every sheet into a {name: DataFrame} dict.
    excelDataFrames = pd.read_excel(url, sheet_name=None, header=header, storage_options=headers)
    excelTableJson = {}
    for tableName, excelDataFrame in excelDataFrames.items():
        # Strip all whitespace, newlines and tabs out of every cell.
        excelDataFrame.replace(r'\s+|\n|\t', '', regex=True, inplace=True)
        # Skip sheets with no rows at all.
        if list(excelDataFrame.values):
            if keys:
                # Locate the header row from the given keywords.
                index = get_table_index(excelDataFrame, keys)
            if index is not None:
                if index != -1:
                    # Promote row `index` to be the header.
                    excelDataFrame = change_df(df=excelDataFrame, index=index, method=method, cols=cols)
                elif header:
                    # index == -1: fall back to the explicit `header` argument.
                    excelDataFrame = change_df(df=excelDataFrame, index=index, header=header, method=method, cols=cols)
            # NOTE(review): the original's flattened indentation is ambiguous —
            # this assumes every non-empty sheet is emitted, transformed or not.
            excelTableJson[tableName] = json.loads(excelDataFrame.to_json(orient='records'))
    print(url, '处理完毕!!!')
    return excelTableJson
# TODO: add request headers when fetching PDFs such as:
#   http://www.xa.gov.cn/web_files/file/2023/01/11/202301111459070356479.pdf
import pdfplumber
from PyPDF2 import PdfFileReader, PdfReader
from fake_useragent import UserAgent
USER_AGENT = UserAgent().random
def get_pdf_table1(url, headers=None):
    """
    Extract the table from the first page of a PDF and return it as JSON-style
    data: [{row-1 k/v pairs}, {row-2 k/v pairs}, ...].

    Args:
        url: http(s) URL or local path of the PDF.
        headers: HTTP headers for the download; defaults to a random
            User-Agent.  (Previously a mutable dict default — shared across
            calls and unsafe to mutate.)
    """
    if headers is None:
        headers = {"User-Agent": USER_AGENT}
    print(url, '正在处理...')
    # Fetch the PDF bytes from the network or from a local file.
    if url.startswith("http"):
        pdf_bytes = requests.get(url, headers=headers).content
    else:
        with open(url, "rb") as src:  # was left unclosed before
            pdf_bytes = src.read()
    # Persist to a scratch file for pdfplumber.
    with open('test.pdf', 'wb') as f:
        f.write(pdf_bytes)
    # Parse the PDF and pull the table off the first page.
    with pdfplumber.open("./test.pdf") as pdf:  # close the PDF when done
        first_page = pdf.pages[0]
        table = first_page.extract_table()
    # Row 0 is blank in these PDFs, row 1 holds the column names.
    # (For PDFs whose first row is the header, use table[1:] / table[0].)
    df = pd.DataFrame(table[2:], columns=table[1])
    # Drop the first (empty) column and clean embedded newlines in headers.
    df = df[df.columns[1:]]
    df.columns = [name.replace("\n", "") for name in df.columns]
    return df.to_dict(orient="records")
# NOTE: the URL passed in must be ASCII (percent-encode non-ASCII parts), e.g.:
#   get_pdf_table1(link_response.urljoin(parse.quote(img_url)), headers={})
#   excelJsonDict = get_pdf_table1(url, headers={})
# Fallback method:
def get_pdf_table(url, headers=None):
    """
    Extract tables from every page of a PDF (via camelot, 'stream' flavor)
    and return JSON-style data: [{row-1 k/v pairs}, {row-2 k/v pairs}, ...].

    Args:
        url: http(s) URL of the PDF.
        headers: HTTP headers for the download; a random User-Agent is
            always applied (on a copy — the caller's dict is not mutated).
    """
    if headers is None:
        headers = {}
    # Copy before adding the UA so the caller's dict is never mutated.
    headers = {**headers, "User-Agent": USER_AGENT}
    print(url, '正在处理...')
    # Download once to count the pages.
    pdf_bytes = requests.get(url, headers=headers).content
    with tempfile.TemporaryFile() as fp:  # was never closed before
        fp.write(pdf_bytes)
        fp.seek(0)
        # PdfReader replaces the deprecated PdfFileReader/getNumPages.
        # If the PDF is encrypted, PyPDF2 may raise "File has not been
        # decrypted" — call reader.decrypt('') first in that case.
        reader = PdfReader(fp)
        page_count = len(reader.pages)
    pages = ",".join(str(i) for i in range(1, page_count + 1))
    # NOTE(review): `headers` as a camelot.read_pdf kwarg looks unsupported
    # in current camelot releases — confirm against the installed version.
    pdfs = camelot.read_pdf(url, pages=pages, flavor='stream', headers=headers)
    frames = []
    for pdf in pdfs:
        pdfData = pdf.df
        # Drop rows whose third column is empty (layout artifacts).
        pdfData = pdfData[~pdfData[2].isin([''])]
        frames.append(pdfData)
    # DataFrame.append was removed in pandas 2.0 — concatenate instead.
    pdfDataFrame = pd.concat(frames) if frames else pd.DataFrame()
    # Promote row 0 to the header.
    pdfDataFrame = change_df(pdfDataFrame, 0)
    print(pdfDataFrame.columns[0])
    # Remove repeated header rows carried over from subsequent pages.
    first_col = pdfDataFrame.columns[0]
    pdfDataFrame = pdfDataFrame[~pdfDataFrame[first_col].isin([first_col])]
    # Forward-fill merged/blank cells (fillna(method='pad') is deprecated).
    pdfDataFrame = pdfDataFrame.ffill()
    pdfJson = json.loads(pdfDataFrame.to_json(orient='records'))
    print(url, '处理完毕!!!')
    return pdfJson