利用百度 AI 开放平台的 OCR 文字识别 API 识别并提取图片中的文字。首先需注册并获取调用 API 所需的 ID 和 Key,步骤如下:
打开百度AI开放平台,进入控制台中的文字识别应用(需要有百度账号)。
创建一个应用,并进入管理应用,记下 AppID、API Key、Secret Key,调用 API 时需要用到。
py2,py3均适用
- # coding=utf-8
- import sys,time,json,base64,fitz
- # 保证兼容python2以及python3
- IS_PY3 = sys.version_info.major == 3
- if IS_PY3:
- from urllib.request import urlopen,Request
- from urllib.error import URLError
- from urllib.parse import urlencode,quote_plus
- else:
- import urllib2
- from urllib import quote_plus,urlencode
- from urllib2 import urlopen,URLError,Request
- # 防止https证书校验不正确
- import ssl
- ssl._create_default_https_context = ssl._create_unverified_context
-
-
class IMG_OCR():
    """Client for the Baidu high-accuracy general OCR REST endpoint.

    Creating an instance immediately requests an OAuth access token, so the
    constructor performs a network call.
    """

    def __init__(self, api_key='aq', secret_key='1u'):
        """Store the credentials and fetch the access token.

        Args:
            api_key: API Key of the Baidu application.
            secret_key: Secret Key of the Baidu application.
        """
        self.API_KEY = api_key
        self.SECRET_KEY = secret_key
        self.OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
        self.TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
        # Fetch the access token.
        self.token = self.fetch_token()
        # Build the OCR request URL carrying the token.
        self.image_url = self.OCR_URL + "?access_token=" + self.token

    def fetch_token(self):
        """Request an access token from TOKEN_URL and return it.

        Returns:
            The ``access_token`` string of the token response.

        Raises:
            SystemExit: if the request fails, the response lacks
                ``access_token``/``scope``, or the scope does not contain
                ``brain_all_scope``.
        """
        params = {
            'grant_type': 'client_credentials',
            'client_id': self.API_KEY,
            'client_secret': self.SECRET_KEY,
        }
        # urlencode output is ASCII, so encoding works on Python 2 and 3.
        post_data = urlencode(params).encode('utf-8')
        req = Request(self.TOKEN_URL, post_data)
        try:
            f = urlopen(req, timeout=5)
            try:
                result_str = f.read()
            finally:
                f.close()
        except URLError as err:
            # Without a response there is nothing to parse; stop here instead
            # of failing later on an undefined variable.
            print(err)
            raise SystemExit('fetch access token failed')

        result = json.loads(result_str.decode('utf-8'))
        if 'access_token' in result and 'scope' in result:
            if 'brain_all_scope' not in result['scope'].split(' '):
                raise SystemExit('please ensure has check the ability')
            return result['access_token']
        raise SystemExit('please overwrite the correct API_KEY and SECRET_KEY')

    def read_file(self, image_path):
        """Return the raw bytes of image_path, or None if it cannot be read."""
        try:
            with open(image_path, 'rb') as f:
                return f.read()
        except (IOError, OSError):
            print('read image file fail')
            return None

    def request(self, url, data):
        """POST the url-encoded string data to url.

        Returns:
            The decoded response body, or None when the request fails (the
            error is printed).
        """
        req = Request(url, data.encode('utf-8'))
        try:
            f = urlopen(req)
            try:
                result_str = f.read()
            finally:
                f.close()
        except URLError as err:
            print(err)
            return None
        return result_str.decode('utf-8')

    def main(self, file_content):
        """Run OCR on the image bytes and return the recognized lines.

        Args:
            file_content: raw image bytes, e.g. the result of read_file().

        Returns:
            A list with the ``words`` value of every ``words_result`` entry;
            an empty list when there is no image content, the request failed,
            or the response carries no ``words_result``.
        """
        # read_file() returns None on failure; there is nothing to send then.
        if not file_content:
            return []
        # Call the OCR service with the base64-encoded image.
        payload = urlencode({'image': base64.b64encode(file_content)})
        result = self.request(self.image_url, payload)
        if result is None:
            return []
        # Parse the response.
        result_json = json.loads(result)
        print(result_json)
        return [item["words"] for item in result_json.get("words_result", [])]
-
-
class PDF_OCR():
    """OCR a PDF by rendering each page to an image and passing it to IMG_OCR."""

    def __init__(self):
        self.img_ocr = IMG_OCR()

    def _page_image_path(self, imgPath, page_number):
        """Return the image file path used for the given zero-based page."""
        # imgPath is used as a plain string prefix: it has to end with a path
        # separator for the images to land inside that folder.
        return imgPath + str(page_number) + ".jpg"

    def pdf_image(self, pdfPath, imgPath, zoom_x=5, zoom_y=5, rotation_angle=0):
        """Render every page of a PDF to an image file.

        Args:
            pdfPath: path of the PDF file.
            imgPath: folder (string prefix) the images are saved under.
            zoom_x: scale factor in x direction.
            zoom_y: scale factor in y direction.
            rotation_angle: rotation angle.

        Returns:
            The number of pages in the PDF.
        """
        # Open the PDF file.
        pdf = fitz.open(pdfPath)
        try:
            page_count = pdf.pageCount
            # The scale/rotation matrix is the same for every page.
            trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotation_angle)
            # Render the PDF page by page.
            for pg in range(page_count):
                pm = pdf[pg].getPixmap(matrix=trans, alpha=False)
                # NOTE: the data is written with writePNG although the file
                # name ends in ".jpg".
                pm.writePNG(self._page_image_path(imgPath, pg))
        finally:
            # Close the document even if rendering a page fails.
            pdf.close()

        return page_count

    def main(self, pdfPath, imgPath):
        """Render the PDF to images, OCR each page and return all lines.

        Pages whose image cannot be read back are skipped.
        """
        page_count = self.pdf_image(pdfPath, imgPath)
        text_list = []
        for page in range(page_count):
            file_content = self.img_ocr.read_file(self._page_image_path(imgPath, page))
            if file_content is None:
                # read_file already reported the failure.
                continue
            text_list.extend(self.img_ocr.main(file_content))
        return text_list
-
-
if __name__ == '__main__':
    img_ocr = IMG_OCR()
    # Read the book page image.
    file_content = img_ocr.read_file('./zlgl_img/2020-01-10.jpg')
    if file_content is None:
        # read_file already printed the failure; there is nothing to recognize.
        raise SystemExit(1)
    text = img_ocr.main(file_content)
    print(text)
-
安装 Python 的百度 AI 接口库:
- pip install baidu-aip
-
以下是代码实现,需将所有识别的图片放进名为 picture 的文件夹。
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 利用百度api实现图片文本识别
- @author:
- """
-
- import glob
- from os import path
- import os
- from aip import AipOcr
- from PIL import Image
-
def convertimg(picfile, outdir, max_pixels=4000000):
    """Shrink an oversized picture and save the result into outdir.

    The width and height are halved until width * height is no larger than
    max_pixels; the picture is then resized to that size and saved under its
    original file name inside outdir.

    Args:
        picfile: path of the picture.
        outdir: output directory for the picture.
        max_pixels: largest allowed pixel count; per the original author's
            note, the default yields files of roughly 200-odd KB.

    Raises:
        ValueError: if max_pixels is smaller than 1.
    """
    if max_pixels < 1:
        # A non-positive limit would shrink the picture down to zero size.
        raise ValueError("max_pixels must be at least 1")
    # The context manager closes the underlying file once resizing is done.
    with Image.open(picfile) as img:
        width, height = img.size
        while width * height > max_pixels:
            width //= 2
            height //= 2
        new_img = img.resize((width, height), Image.BILINEAR)
    new_img.save(path.join(outdir, path.basename(picfile)))
-
def baiduOCR(picfile, outfile):
    """Recognize the text of a picture with the Baidu API and append it to outfile.

    Args:
        picfile: path of the picture file.
        outfile: text file the recognized text is appended to.

    When the response carries no ``words_result``, the failure is printed and
    nothing is written to outfile.
    """
    filename = path.basename(picfile)

    APP_ID = '******'  # the ID obtained earlier; same for the keys below
    API_KEY = '******'
    SECRET_KEY = '******'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

    with open(picfile, 'rb') as image_file:
        img = image_file.read()
    print("正在识别图片:\t" + filename)
    # General text recognition (50 000 free calls per day according to the
    # original note); client.basicAccurate(img) is the high-accuracy variant
    # (800 free calls per day according to the same note).
    message = client.basicGeneral(img)
    words_result = message.get('words_result')
    if words_result is None:
        # Report the failure instead of claiming success and then failing
        # while iterating over None.
        print("识别失败:\t" + str(message.get('error_msg')))
        print()
        return
    print("识别成功!")

    # Explicit UTF-8 so the Chinese text is written the same on every platform.
    with open(outfile, 'a+', encoding='utf-8') as fo:
        fo.write("+" * 60 + '\n')
        fo.write("识别图片:\t" + filename + "\n" * 2)
        fo.write("文本内容:\n")
        # Write the recognized text, one result per line.
        fo.writelines(item.get('words', '') + '\n' for item in words_result)
        fo.write('\n' * 2)
    print("文本导出成功!")
    print()
-
if __name__ == "__main__":

    outfile = 'export.txt'
    outdir = 'tmp'
    if path.exists(outfile):
        os.remove(outfile)
    if not path.exists(outdir):
        os.mkdir(outdir)
    print("压缩过大的图片...")
    # First shrink oversized pictures to speed up recognition; the shrunk
    # pictures are stored in the temporary folder.
    for picfile in glob.glob("picture/*"):
        convertimg(picfile, outdir)
    print("图片识别...")
    # Read from the same folder the pictures were just written to.
    for picfile in glob.glob(path.join(outdir, "*")):
        baiduOCR(picfile, outfile)
        os.remove(picfile)
    print('图片文本提取结束!文本输出结果位于 %s 文件中。' % outfile)
    os.removedirs(outdir)
-
github:https://github.com/Baidu-AIP/QuickStart