python实现表格图片转excel格式文件,excel表格取数据,表格PDF取数据
-
-
def Image_Excel(APP_ID, API_KEY, SECRET_KEY, img_response_body):
    """Submit a table image to Baidu AI OCR and return the result download URL.

    Parameters
    ----------
    APP_ID, API_KEY, SECRET_KEY : str
        Baidu AIP credentials used to construct the ``AipOcr`` client.
    img_response_body : bytes
        Raw image bytes. The image must be between 1 KB and 4 MB, otherwise
        the API responds with ``image size error, image is too big or too
        small, upper limit 4M, lower limit 1k``.

    Returns
    -------
    str
        URL from which the recognised table (an .xlsx file) can be downloaded.

    Raises
    ------
    TimeoutError
        If the asynchronous recognition job does not report completion
        within ~2 minutes of polling.
    """
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # Kick off the asynchronous table-recognition job and remember its id.
    table = client.tableRecognitionAsync(img_response_body)
    request_id = table['result'][0]['request_id']
    # Poll until the status message equals '已完成' ("finished").  The
    # original code looped forever; cap the polling so a failed or stuck
    # job cannot hang the caller indefinitely.
    for _ in range(60):
        result = client.getTableRecognitionResult(request_id)
        if result['result']['ret_msg'] == '已完成':
            # Job finished: the payload is the download URL of the workbook.
            return result['result']['result_data']
        time.sleep(2)  # give the backend time to work before polling again
    raise TimeoutError('Baidu table recognition did not finish in time')
-
-
- #ln
def img_2_table(img_read):
    """Recognise a table image via :func:`Image_Excel` using the module-level
    Baidu credentials, and return the result download URL."""
    return Image_Excel(APP_ID, API_KEY, SECRET_KEY, img_read)
-
-
def get_excel_table(url, index=None, method=None, cols=None, keys=None, header=None, headers=USER_AGENT):
    """Fetch an Excel workbook from *url* and return its sheets as JSON rows.

    Result format: ``{sheet_name: [{row-1 key/value pairs}, {row-2 ...}, ...]}``.
    Date cells may come back as strings (e.g. ``2021-8-25``) or as integer
    timestamps, depending on how the sheet stores them.

    Parameters
    ----------
    url : str
        Location of the workbook (anything ``pandas.read_excel`` accepts).
    index : int, optional
        Row index of the header inside the sheet; overridden when *keys* is given.
    method, cols : optional
        Passed through to ``change_df`` for reshaping.
    keys : optional
        Marker cell values handed to ``get_table_index`` to locate the header
        row automatically.
    header : optional
        Forwarded to ``pandas.read_excel``; also enables the ``index == -1`` path.
    headers :
        Forwarded as ``storage_options`` to ``pandas.read_excel``.
        NOTE(review): the default is a bare User-Agent *string*, but
        ``storage_options`` normally expects a dict — confirm against callers.
    """
    print(url, '正在处理...')
    # sheet_name=None loads every sheet into a {name: DataFrame} mapping.
    excelDataFrames = pd.read_excel(url, sheet_name=None, header=header, storage_options=headers)
    excelTableJson = {}
    for tableName, excelDataFrame in excelDataFrames.items():
        # Strip all whitespace/newlines/tabs from every cell in one pass.
        excelDataFrame.replace(r'\s+|\n|\t', '', regex=True, inplace=True)
        if list(excelDataFrame.values):  # skip completely empty sheets
            if keys:
                # Auto-detect the header row; this overrides the caller's index.
                index = get_table_index(excelDataFrame, keys)
            if index is not None:
                if index != -1:
                    # Header row found inside the sheet: reshape around it.
                    excelDataFrame = change_df(df=excelDataFrame, index=index, method=method, cols=cols)
                elif header:
                    # index == -1 with an explicit header: reshape using it.
                    excelDataFrame = change_df(df=excelDataFrame, index=index, header=header, method=method, cols=cols)
                excelTableJson[tableName] = json.loads(excelDataFrame.to_json(orient='records'))
            else:
                # No header information at all: dump the rows as-is.
                excelTableJson[tableName] = json.loads(excelDataFrame.to_json(orient='records'))
    print(url, '处理完毕!!!')
    return excelTableJson
-
Example of a PDF that rejects bare requests — pass HTTP headers when downloading it: http://www.xa.gov.cn/web_files/file/2023/01/11/202301111459070356479.pdf
import pdfplumber
# NOTE(review): PdfFileReader was removed in PyPDF2 >= 3.0 in favour of
# PdfReader — confirm the installed PyPDF2 version still provides both names.
from PyPDF2 import PdfFileReader, PdfReader
from fake_useragent import UserAgent

# Random browser User-Agent string, generated once at import time and reused
# as the default for every HTTP download in this module.
USER_AGENT = UserAgent().random
-
-
def get_pdf_table1(url, headers=None):
    """Extract the table on the first page of a PDF as a list of row dicts.

    Result format: ``[{row-1 key/value pairs}, {row-2 ...}, ...]``.

    Parameters
    ----------
    url : str
        Either an http(s) URL or a local file path of the PDF.  A remote URL
        must be ASCII-only — percent-encode non-ASCII parts before calling.
    headers : dict, optional
        HTTP headers for the download.  Defaults to a random User-Agent
        (built lazily here instead of as a mutable default argument, so the
        value is fresh per call and the dict is never shared between calls).

    Returns
    -------
    list[dict]
        One dict per data row of the extracted table.
    """
    if headers is None:
        headers = {"User-Agent": USER_AGENT}
    print(url, '正在处理...')
    # Fetch the raw PDF bytes either over HTTP or from the local filesystem.
    if url.startswith("http"):
        pdf_bytes = requests.get(url, headers=headers).content
    else:
        # Close the handle explicitly — the original leaked it.
        with open(url, "rb") as src:
            pdf_bytes = src.read()
    # pdfplumber works on files, so stage the bytes in ./test.pdf
    # (side effect kept for backward compatibility with the original).
    with open('test.pdf', 'wb') as f:
        f.write(pdf_bytes)
    # Use a context manager so the PDF handle is released when done.
    with pdfplumber.open("./test.pdf") as pdf:
        first_page = pdf.pages[0]
        # extract_table() returns a list of rows (each a list of cell strings).
        table = first_page.extract_table()
    # Layout assumption from the PDFs this was written for: row 0 is empty,
    # row 1 holds the column names, data starts at row 2 — TODO confirm for
    # other documents (use table[1:], columns=table[0] when row 0 is the header).
    df = pd.DataFrame(table[2:], columns=table[1])
    df = df[df.columns[1:]]  # drop the first (index-like) column
    df.columns = [name.replace("\n", "") for name in df.columns]
    return df.to_dict(orient="records")
-
- # The url passed here must be ASCII-only — percent-encode non-ASCII parts (hence parse.quote below):
-
- get_pdf_table1(link_response.urljoin(parse.quote(img_url)), headers={})
-
excelJsonDict = get_pdf_table1(pdf_url, headers={})
Fallback method (used when the primary extractor above fails):
def get_pdf_table(url, headers=None):
    """Fallback PDF table extractor: parse every page of *url* with camelot.

    Result format: ``[{row-1 key/value pairs}, {row-2 ...}, ...]``.

    Parameters
    ----------
    url : str
        http(s) URL of the PDF.
    headers : dict, optional
        Extra HTTP headers; a random User-Agent is always merged in.
        The caller's dict is no longer mutated (the original updated it
        in place).

    Returns
    -------
    list[dict]
        One dict per data row, headers repeated on later pages removed.
    """
    # Merge instead of headers.update(...) so the caller's dict stays intact.
    headers = {**(headers or {}), "User-Agent": USER_AGENT}
    print(url, '正在处理...')
    # Download once just to count the pages for camelot's `pages` argument.
    pdf_bytes = requests.get(url, headers=headers).content
    with tempfile.TemporaryFile() as fp:
        fp.write(pdf_bytes)
        fp.seek(0)  # rewind before parsing
        # PdfReader / .pages replace the PdfFileReader / getNumPages API
        # that was removed in PyPDF2 3.x.
        # Without decryption an encrypted file may raise
        # "File has not been decrypted"; call reader.decrypt('') if needed.
        reader = PdfReader(fp)
        page_count = len(reader.pages)
    pages = ",".join(str(i) for i in range(1, page_count + 1))
    pdfs = camelot.read_pdf(url, pages=pages, flavor='stream', headers=headers)
    # Collect the per-page frames and concatenate once —
    # DataFrame.append was removed in pandas 2.x.
    frames = []
    for pdf in pdfs:
        pdfData = pdf.df
        # Drop rows whose third column is blank (layout noise from 'stream').
        pdfData = pdfData[~pdfData[2].isin([''])]
        frames.append(pdfData)
    pdfDataFrame = pd.concat(frames) if frames else pd.DataFrame()
    pdfDataFrame = change_df(pdfDataFrame, 0)
    print(pdfDataFrame.columns[0])
    # Remove header rows that camelot re-emits on every page.
    pdfDataFrame = pdfDataFrame[~pdfDataFrame[pdfDataFrame.columns[0]].isin([pdfDataFrame.columns[0]])]
    # ffill() replaces fillna(method='pad'), removed in pandas 2.x.
    pdfDataFrame = pdfDataFrame.ffill()
    pdfJson = json.loads(pdfDataFrame.to_json(orient='records'))
    print(url, '处理完毕!!!')
    return pdfJson
-
-