python: 用百度API读取增值税发票信息
# encoding: utf-8
# Copyright 2023 涂聚文有限公司
# License information:
# Description:
# Author : geovindu,Geovin Du 涂聚文.
# IDE : PyCharm 2023.1 python 311
# Datetime : 2023/9/30 6:56
# User : geovindu
# Product : PyCharm
# Project : pythonTkinterDemo
# File : BaiduOCRAPI.py
# explain : learning
"""Read VAT-invoice fields from PDF / image files through the Baidu OCR API.

Sample response returned by the ``vat_invoice`` endpoint for one invoice
(this is the dict that ``getContent`` returns and ``getUsefulInfo`` consumes)::

    {'words_result_num': 46, 'words_result': {'InvoiceNumDigit': '',
    'CommodityUnit': [{'row': '1', 'word': '个'}], 'PurchaserAddress': '',
    'SheetNum': '', 'CommodityType': [], 'TotalAmount': '100.83',
    'Checker': '李思', 'PurchaserBank': '', 'Agent': '否',
    'Password': '*71<09/<5*61/*67/+5-5>0>876-4794<3/2*802209-<->*7/973<52466>1-7<74+86*/*>1882>1-1>87<86588<6>0<803719>+895-*',
    'InvoiceTypeOrg': '海南增值税电子普通发票',
    'InvoiceCodeConfirm': '046002200111', 'TotalTax': '9.07',
    'ServiceType': '日用品食品',
    'CommodityTaxRate': [{'row': '1', 'word': '9%'}],
    'CommodityTax': [{'row': '1', 'word': '9.07'}],
    'SellerBank': '中国工商银行股份有限公司三亚解放支行2201030109200160703',
    'Remarks': '订单号:278170028513',
    'SellerAddress': '海南省三亚市崖州区崖州湾科技城标准厂房二期三楼C274区22165500-1602/1699',
    'NoteDrawer': '王梅', 'InvoiceTag': '其他',
    'InvoiceNumConfirm': '82098742', 'OnlinePay': '', 'Payee': '王陆',
    'CommodityName': [{'row': '1', 'word': '*其他食品*食品'}],
    'CommodityVehicleType': [], 'InvoiceCode': '046002200111',
    'AmountInWords': '壹佰零玖圆玖角', 'AmountInFiguers': '109.90',
    'City': '', 'InvoiceType': '电子普通发票', 'CommodityEndDate': [],
    'PurchaserName': '重庆海开科技有限公司',
    'InvoiceDate': '2023年07月31日',
    'CommodityNum': [{'row': '1', 'word': '1'}],
    'PurchaserRegisterNum': '915001075828145135',
    'MachineCode': '661719672092', 'CommodityPlateNum': [],
    'CheckCode': '47587549392161874692',
    'SellerRegisterNum': '91460200MA5T41103U',
    'CommodityPrice': [{'row': '1', 'word': '100.83'}],
    'CommodityStartDate': [], 'SellerName': '三亚京东佳禹贸易有限公司',
    'CommodityAmount': [{'row': '1', 'word': '100.83'}],
    'Province': '海南省', 'InvoiceNum': '82098742'},
    'pdf_file_size': 1, 'log_id': 1707917072629791560}
"""

import base64
import json
import logging
import os

import pandas as pd
import requests

logger = logging.getLogger(__name__)


def _first_word(rows):
    """Return the ``'word'`` of the first row, or ``''`` when *rows* is empty.

    Per-commodity fields arrive as lists of ``{'row': ..., 'word': ...}``
    dicts, and the sample response shows such lists can be empty.
    """
    return rows[0]['word'] if rows else ''


class BaiduOCR(object):
    """Read invoice information (PDF or image files) through the Baidu API."""

    # Class-level values kept unchanged so existing ``BaiduOCR.APIKey`` style
    # accesses keep working; instances override them in ``__init__``.
    AppID = "40226401"
    APIKey = "QuXMNizc80gTmUznKDRqQX3D"
    SecretKey = "h6aHaGLssw51CYGtR3dvX1wGg6BBm0zi"

    TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
    VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

    # Seconds to wait for Baidu before giving up; without a timeout a stalled
    # connection would block the caller indefinitely.
    timeout = 30

    def __init__(self, appId=None, apiKey=None, secretKey=None, timeout=None):
        """Set up the credentials used for every request.

        Each credential is taken from the explicit argument, then from the
        environment (``BAIDU_OCR_APP_ID``, ``BAIDU_OCR_API_KEY``,
        ``BAIDU_OCR_SECRET_KEY``), and finally from the built-in default, so
        ``BaiduOCR()`` behaves as before.

        :param appId: Baidu application id.
        :param apiKey: Baidu API key (sent as OAuth ``client_id``).
        :param secretKey: Baidu secret key (sent as OAuth ``client_secret``).
        :param timeout: per-request timeout in seconds; ``None`` keeps the
            class default.
        """
        # NOTE(review): the fallback credentials below are hard-coded in
        # source control. They are kept only for backward compatibility and
        # should be rotated and supplied via arguments or the environment.
        self.AppID = appId or os.environ.get("BAIDU_OCR_APP_ID") or "40226401"
        self.APIKey = (apiKey or os.environ.get("BAIDU_OCR_API_KEY")
                       or "SeP54f3RO7GqifYpX8DPQGQE")
        self.SecretKey = (secretKey or os.environ.get("BAIDU_OCR_SECRET_KEY")
                          or "c2zXHOWM2hlxeECEgwqG6UWlLTN1kQRs")
        if timeout is not None:
            self.timeout = timeout

    def getAccessToken(self):
        """Request an OAuth access token with the instance credentials.

        :return: the access token as a string.
        :raises RuntimeError: when the response carries no ``access_token``
            (previously the literal string ``"None"`` was returned and the
            failure only surfaced on the next request).
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        payload = requests.post(
            self.TOKEN_URL, params=params, timeout=self.timeout
        ).json()
        token = payload.get("access_token")
        if not token:
            raise RuntimeError(
                f"Baidu OAuth did not return an access token: {payload!r}"
            )
        return str(token)

    def _recognize(self, accessToken, filePath, fieldName):
        """Post one base64-encoded file to the VAT-invoice endpoint.

        :param accessToken: token obtained from ``getAccessToken``.
        :param filePath: path of the file to upload.
        :param fieldName: form field carrying the payload
            (``"pdf_file"`` or ``"image"``).
        :return: the decoded JSON response.
        """
        # ``with`` closes the handle even if reading or encoding fails.
        with open(filePath, 'rb') as fh:
            encoded = base64.b64encode(fh.read())
        response = requests.post(
            self.VAT_INVOICE_URL,
            params={"access_token": accessToken},
            data={fieldName: encoded},
            headers={'content-type': 'application/x-www-form-urlencoded'},
            timeout=self.timeout,
        )
        result = response.json()
        # The token and the raw payload are deliberately not logged.
        logger.debug("vat_invoice response for %s: %s", filePath, result)
        return result

    def getContent(self, accessToken, pdfFile):
        """Run invoice recognition on a PDF file.

        :param accessToken: token obtained from ``getAccessToken``.
        :param pdfFile: path of the PDF invoice.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, pdfFile, "pdf_file")

    def getContentPng(self, accessToken, pngFile):
        """Run invoice recognition on an image file.

        :param accessToken: token obtained from ``getAccessToken``.
        :param pngFile: path of the invoice image.
        :return: the decoded JSON response.
        """
        return self._recognize(accessToken, pngFile, "image")

    def getUsefulInfo(self, content, pdfName):
        """Pick the report columns out of a recognition response.

        Per-commodity columns use the first row only; an empty row list
        yields ``''`` instead of raising ``IndexError``.

        :param content: decoded JSON response from ``getContent`` /
            ``getContentPng``.
        :param pdfName: label stored in the '发票文件名' column.
        :return: dict mapping the Chinese column titles to field values.
        :raises KeyError: when *content* has no ``'words_result'`` entry or
            a required field is missing.
        """
        logger.debug("Json %s", content)
        try:
            words_result = content['words_result']
        except KeyError:
            # Include the whole response so the service's own error details
            # are visible to the caller.
            raise KeyError(
                f"response has no 'words_result': {content!r}"
            ) from None
        info = {
            '发票文件名': pdfName,
            '发票号码': str(words_result['InvoiceNum']),
            '开票日期': words_result['InvoiceDate'],
            '货物名称': _first_word(words_result['CommodityName']),
            '未税金额': _first_word(words_result['CommodityAmount']),
            '货物税率': _first_word(words_result['CommodityTaxRate']),
            '货物税额': _first_word(words_result['CommodityTax']),
            '合计金额': words_result['TotalAmount'],
            '合计税额': words_result['TotalTax'],
            '价税合计(小写)': words_result['AmountInFiguers'],
            '价税合计(大写)': words_result['AmountInWords'],
            '销售方名称': words_result['SellerName'],
            '销售方纳税人识别号': words_result['SellerRegisterNum'],
            '销售方银行及账户': words_result['SellerBank'],
            '销售方地址及电话': words_result['SellerAddress'],
            '购买方名称': words_result['PurchaserName'],
            '购买方纳税人识别号': words_result['PurchaserRegisterNum'],
            '机器编号': words_result['MachineCode'],
        }
        return info
调用:用京东多张发票测试成功
# Usage example: recognise every PDF invoice under invoice/ and export the
# extracted fields to an Excel workbook.
ocr = Common.BaiduOCRAPI.BaiduOCR()
invoiceDir = "invoice/"

# Sorted so the rows come out in a reproducible order; the extension check is
# case-insensitive so "A.PDF" is picked up as well.
pdfFilelist = [
    name for name in sorted(os.listdir(invoiceDir))
    if name.lower().endswith(".pdf")
]

infolist = []
if pdfFilelist:
    # Fetch the token once for the whole batch instead of once per file.
    # NOTE(review): assumes a token stays valid for the duration of the run -
    # confirm against Baidu's documented token lifetime.
    access_token = ocr.getAccessToken()
    for pdfFile in pdfFilelist:
        # splitext yields the bare name as a string; the previous
        # split(".")[:-1] stored a list in the '发票文件名' column.
        pdfName = os.path.splitext(pdfFile)[0]
        print(pdfFile)
        content = ocr.getContent(access_token, os.path.join(invoiceDir, pdfFile))
        infolist.append(ocr.getUsefulInfo(content, pdfName))

df = pd.DataFrame(infolist)
print(df)

# Pass mode='a' to ExcelWriter to append to an existing workbook instead.
with pd.ExcelWriter('geovindu.xlsx') as writer:
    df.to_excel(writer, sheet_name='geovindu', index=False)
哲学管理(学)人生, 文学艺术生活, 自动(计算机学)物理(学)工作, 生物(学)化学逆境, 历史(学)测绘(学)时间, 经济(学)数学金钱(理财), 心理(学)医学情绪, 诗词美容情感, 美学建筑(学)家园, 解构建构(分析)整合学习, 智商情商(IQ、EQ)运筹(学)生存.---Geovin Du(涂聚文)