要批量提取增值税发票的关键信息并将其导出为 Excel 文件-数字梦境

注意：需要把发票文件放到当前目录的《发票》文件夹里面如下图：
使用python来运行以下代码即可：
import pandas as pd
import numpy as np
import base64
import os
import re
import glob
import platform
import json
import logging
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models

# 配置日志记录
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 判断操作系统 Mac 还是 Win，并设置路径分隔符
if platform.system() == 'Windows':
    chr = "\\"
elif platform.system() == 'Darwin':
    chr = '/'
else:
    chr = '/'

# 腾讯云API密钥（需要自行申请）
# 腾讯云OCR介绍https://cloud.tencent.com/document/api/866/36210
SecretId = '你的腾讯云API密钥'
SecretKey = '你的腾讯云API密钥'

# 设置发票文件夹路径为当前目录下的“发票”文件夹
base_folder = os.path.join(os.getcwd(), '发票')

# 检查文件夹是否存在
if not os.path.exists(base_folder):
    logging.error(f"The folder '{base_folder}' does not exist.")
    raise FileNotFoundError(f"The folder '{base_folder}' does not exist.")

# 获取发票文件列表
list_invoice_file = glob.glob(os.path.join(base_folder, '*.pdf'))
if not list_invoice_file:
    logging.error(f"No PDF files found in '{base_folder}'.")
    raise FileNotFoundError(f"No PDF files found in '{base_folder}'.")

# 定义处理发票的函数
def detail_invoice(path_file, n):
    logging.info(f"Processing file {path_file}")
    
    # 读取PDF文件并进行Base64编码
    with open(path_file, "rb") as pdf_file:
        encoded_string = base64.b64encode(pdf_file.read())
    base64string = str(encoded_string, 'utf-8')

    # 存储发票信息的字典
    dirc = {}

    try:
        # 设置腾讯云API的凭证
        cred = credential.Credential(SecretId, SecretKey)
        httpProfile = HttpProfile()
        httpProfile.endpoint = "ocr.tencentcloudapi.com"

        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = ocr_client.OcrClient(cred, "ap-beijing", clientProfile)

        # 设置请求参数
        req = models.VatInvoiceOCRRequest()
        params = {
            "ImageBase64": base64string,
            "IsPdf": True,
            "PdfPageNumber": 1
        }
        req.from_json_string(json.dumps(params))

        logging.info("Sending request to Tencent Cloud OCR API")
        resp = client.VatInvoiceOCR(req)
        logging.info("Received response from Tencent Cloud OCR API")

        # 解析响应数据
        reinfo = json.loads(resp.to_json_string())

        # 处理发票信息
        for i in reinfo["VatInvoiceInfos"]:
            count = 0
            name = i['Name']
            nameraw = i['Name']
            value = i['Value']
            if str(name) in dirc:
                while str(name) in dirc:
                    count += 1
                    name = nameraw + str(count)
                dirc[name] = value
            else:
                dirc[name] = value

        # 提取税率信息
        tax_rate = None
        for item in reinfo.get("Items", []):
            tax_rate = item.get("TaxRate", None)
            if tax_rate:
                break

        if tax_rate:
            dirc['税率'] = tax_rate

        # 将字典转换为DataFrame并添加到列表
        df_detail = pd.DataFrame.from_dict(dirc, orient='index')
        road.append(df_detail)

        # 检查并处理关键信息
        invoice_date = dirc.get('开票日期', '未知日期')
        seller_name = dirc.get('销售方名称', '未知销售方')
        amount_lowercase = dirc.get('价税合计(小写)', '未知金额')
        invoice_number = dirc.get('发票号码', f'未知号码_{n}')

        # 重命名文件
        ReFileName = f"{invoice_date}_{seller_name}_{amount_lowercase}_{invoice_number}__{n}.pdf"
        os.rename(path_file, os.path.join(base_folder, ReFileName))
        logging.info(f"Renamed file to {ReFileName}")

    except TencentCloudSDKException as err:
        logging.error(f"Error processing file {path_file}: {err}")
        print(err)

# 准备处理发票
road = []
n = 0
logging.info(f"Starting to process {len(list_invoice_file)} files")
for i in range(len(list_invoice_file)):
    path_file = list_invoice_file[i]
    n += 1
    detail_invoice(path_file, n)

# 信息合并
logging.info("Combining processed information into final DataFrame")
df_detail_sheet = pd.concat(road, ignore_index=True, axis=1)

# 表格微调
df1 = df_detail_sheet.T
df1['re金额'] = df1['价税合计(小写)'].apply(lambda x: float(re.search("\d+(\.\d+)?", x).group()) if pd.notnull(x) else 0)

# 数据导出
output_file = '发票信息.xlsx'
logging.info(f"Exporting data to {output_file}")
df1.to_excel(output_file, index=False)
logging.info("Processing complete")
运行后效果如下：
文章版权归作者所有，未经允许请勿转载。
THE END
网络教程
# 文件 # 要批量提取增值税发票的关键信息并将其导出为 # Excel