analyze_formulas.py 11.5 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel公式分析工具
分析工资核算Excel文件中的公式和引用关系
"""

import pandas as pd
import openpyxl
import os
import re
from pathlib import Path

def extract_formulas_from_excel(file_path):
    """从Excel文件中提取所有公式"""
    try:
        # 使用openpyxl读取Excel文件以获取公式
        workbook = openpyxl.load_workbook(file_path, data_only=False)
        
        formulas = []
        references = []
        
        for sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
            
            for row in worksheet.iter_rows():
                for cell in row:
                    if cell.data_type == 'f':  # 公式类型
                        formula = str(cell.value)
                        cell_ref = f"{sheet_name}!{cell.coordinate}"
                        
                        # 提取公式中的引用
                        refs = extract_references_from_formula(formula)
                        
                        formulas.append({
                            'file': os.path.basename(file_path),
                            'sheet': sheet_name,
                            'cell': cell.coordinate,
                            'formula': formula,
                            'references': refs
                        })
                        
                        references.extend(refs)
        
        return formulas, references
        
    except Exception as e:
        print(f"分析文件 {file_path} 时出错: {e}")
        return [], []

def extract_references_from_formula(formula):
    """从公式中提取引用关系"""
    references = []
    
    # 匹配各种引用模式
    patterns = [
        # XLOOKUP引用模式
        r"XLOOKUP\([^,]+,\s*'([^']+)'!([^,]+),\s*'([^']+)'!([^,]+)",
        # VLOOKUP引用模式
        r"VLOOKUP\([^,]+,\s*'([^']+)'!([^,]+)",
        # 直接工作表引用
        r"'([^']+)'!([A-Z]+\d+)",
        # 工作表范围引用
        r"'([^']+)'!([A-Z]+\d+:[A-Z]+\d+)",
        # 简单单元格引用
        r"([A-Z]+\d+)",
        # 范围引用
        r"([A-Z]+\d+:[A-Z]+\d+)",
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, formula)
        for match in matches:
            if isinstance(match, tuple):
                if len(match) == 2:
                    references.append({
                        'type': 'worksheet_reference',
                        'file': match[0] if match[0] else None,
                        'range': match[1]
                    })
                elif len(match) == 4:
                    references.append({
                        'type': 'xlookup_reference',
                        'file': match[0],
                        'lookup_range': match[1],
                        'file2': match[2],
                        'return_range': match[3]
                    })
            else:
                references.append({
                    'type': 'cell_reference',
                    'range': match
                })
    
    return references

def analyze_formula_dependencies(formulas):
    """分析公式依赖关系"""
    dependencies = {}
    
    for formula in formulas:
        cell_key = f"{formula['file']}!{formula['sheet']}!{formula['cell']}"
        dependencies[cell_key] = {
            'formula': formula['formula'],
            'dependencies': formula['references']
        }
    
    return dependencies

def generate_formula_report(formulas, references, dependencies):
    """生成公式分析报告"""
    report_path = "/Users/mr.wang/代码库/绿纤/lvqianmeiye_ERP/Excel公式分析报告.md"
    
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Excel公式分析报告\n\n")
        f.write("## 概述\n\n")
        f.write("本报告分析了工资核算Excel文件中的公式和引用关系,帮助理解数据计算逻辑。\n\n")
        
        # 统计信息
        f.write("## 统计信息\n\n")
        f.write(f"- 总公式数量: {len(formulas)}\n")
        f.write(f"- 总引用数量: {len(references)}\n")
        f.write(f"- 涉及文件数量: {len(set(f['file'] for f in formulas))}\n\n")
        
        # 按文件分组分析
        files = {}
        for formula in formulas:
            file_name = formula['file']
            if file_name not in files:
                files[file_name] = []
            files[file_name].append(formula)
        
        f.write("## 文件公式分析\n\n")
        
        for file_name, file_formulas in files.items():
            f.write(f"### {file_name}\n\n")
            f.write(f"公式数量: {len(file_formulas)}\n\n")
            
            # 按工作表分组
            sheets = {}
            for formula in file_formulas:
                sheet_name = formula['sheet']
                if sheet_name not in sheets:
                    sheets[sheet_name] = []
                sheets[sheet_name].append(formula)
            
            for sheet_name, sheet_formulas in sheets.items():
                f.write(f"#### 工作表: {sheet_name}\n\n")
                f.write(f"公式数量: {len(sheet_formulas)}\n\n")
                
                # 显示前10个公式
                for i, formula in enumerate(sheet_formulas[:10]):
                    f.write(f"**{formula['cell']}**:\n")
                    f.write(f"```\n{formula['formula']}\n```\n")
                    
                    if formula['references']:
                        f.write("引用关系:\n")
                        for ref in formula['references']:
                            if ref['type'] == 'xlookup_reference':
                                f.write(f"- XLOOKUP: {ref['file']}!{ref['lookup_range']} -> {ref['file2']}!{ref['return_range']}\n")
                            elif ref['type'] == 'worksheet_reference':
                                f.write(f"- 工作表引用: {ref['file']}!{ref['range']}\n")
                            elif ref['type'] == 'cell_reference':
                                f.write(f"- 单元格引用: {ref['range']}\n")
                        f.write("\n")
                    else:
                        f.write("无外部引用\n\n")
                
                if len(sheet_formulas) > 10:
                    f.write(f"... 还有 {len(sheet_formulas) - 10} 个公式\n\n")
            
            f.write("---\n\n")
        
        # 引用关系汇总
        f.write("## 引用关系汇总\n\n")
        
        # 统计引用类型
        ref_types = {}
        for ref in references:
            ref_type = ref['type']
            if ref_type not in ref_types:
                ref_types[ref_type] = 0
            ref_types[ref_type] += 1
        
        f.write("### 引用类型统计\n\n")
        for ref_type, count in ref_types.items():
            f.write(f"- {ref_type}: {count} 个\n")
        f.write("\n")
        
        # 外部文件引用
        external_refs = {}
        for ref in references:
            if ref['type'] == 'xlookup_reference' and ref['file']:
                file_name = ref['file']
                if file_name not in external_refs:
                    external_refs[file_name] = []
                external_refs[file_name].append(ref)
        
        if external_refs:
            f.write("### 外部文件引用\n\n")
            for file_name, refs in external_refs.items():
                f.write(f"**{file_name}**: {len(refs)} 个引用\n")
                for ref in refs[:5]:  # 只显示前5个
                    f.write(f"- {ref['file']}!{ref['lookup_range']} -> {ref['file2']}!{ref['return_range']}\n")
                if len(refs) > 5:
                    f.write(f"... 还有 {len(refs) - 5} 个引用\n")
                f.write("\n")
        
        # 公式依赖关系图
        f.write("## 公式依赖关系\n\n")
        f.write("### 关键公式分析\n\n")
        
        # 找出包含XLOOKUP的公式
        xlookup_formulas = [f for f in formulas if 'XLOOKUP' in f['formula']]
        
        if xlookup_formulas:
            f.write("#### XLOOKUP公式分析\n\n")
            for formula in xlookup_formulas[:10]:
                f.write(f"**{formula['file']}!{formula['sheet']}!{formula['cell']}**:\n")
                f.write(f"```\n{formula['formula']}\n```\n")
                
                # 解析XLOOKUP参数
                xlookup_match = re.search(r"XLOOKUP\(([^,]+),\s*'([^']+)'!([^,]+),\s*'([^']+)'!([^,]+)", formula['formula'])
                if xlookup_match:
                    lookup_value = xlookup_match.group(1).strip()
                    lookup_file = xlookup_match.group(2)
                    lookup_range = xlookup_match.group(3)
                    return_file = xlookup_match.group(4)
                    return_range = xlookup_match.group(5)
                    
                    f.write("参数解析:\n")
                    f.write(f"- 查找值: {lookup_value}\n")
                    f.write(f"- 查找范围: {lookup_file}!{lookup_range}\n")
                    f.write(f"- 返回范围: {return_file}!{return_range}\n")
                f.write("\n")
        
        # 数据流分析
        f.write("### 数据流向分析\n\n")
        f.write("基于公式分析,数据流向如下:\n\n")
        
        # 分析数据源文件
        data_sources = set()
        for ref in references:
            if ref['type'] == 'xlookup_reference' and ref['file']:
                data_sources.add(ref['file'])
        
        f.write("**数据源文件**:\n")
        for source in sorted(data_sources):
            f.write(f"- {source}\n")
        f.write("\n")
        
        f.write("**数据流向**:\n")
        f.write("1. 基础数据从各数据源文件获取\n")
        f.write("2. 通过XLOOKUP/VLOOKUP函数进行数据匹配\n")
        f.write("3. 在工资核算表中进行汇总计算\n")
        f.write("4. 生成最终的工资计算结果\n\n")
    
    return report_path

def main():
    """主函数"""
    base_path = "/Users/mr.wang/代码库/绿纤/lvqianmeiye_ERP/参考资料/工资核算 -7月"
    
    # 要分析的文件列表
    files_to_analyze = [
        "工资(全字段).xlsx",
        "①B-a-26考勤汇总表.xlsx",
        "②B-a-17每日早报.xlsx", 
        "③B-a-③呈现-消耗明细表.xlsx",
        "④B-a-25社保统计表.xlsx",
        "⑤B-a-12奖励统计表.xlsx",
        "B-b-③健康师底薪.xlsx",
        "B-b-④健康师提成-金三角顾问.xlsx",
        "B-b-⑤其他岗位工资.xlsx",
        "B-b-⑤当月数据及门店毛利.xlsx"
    ]
    
    all_formulas = []
    all_references = []
    
    print("开始分析Excel公式...")
    print("=" * 60)
    
    for filename in files_to_analyze:
        file_path = os.path.join(base_path, filename)
        if os.path.exists(file_path):
            print(f"正在分析: {filename}")
            formulas, references = extract_formulas_from_excel(file_path)
            all_formulas.extend(formulas)
            all_references.extend(references)
            print(f"  发现 {len(formulas)} 个公式")
        else:
            print(f"文件不存在: {filename}")
    
    # 分析依赖关系
    dependencies = analyze_formula_dependencies(all_formulas)
    
    # 生成报告
    report_path = generate_formula_report(all_formulas, all_references, dependencies)
    
    print(f"\n公式分析完成!")
    print(f"总公式数量: {len(all_formulas)}")
    print(f"总引用数量: {len(all_references)}")
    print(f"分析报告: {report_path}")

if __name__ == "__main__":
    main()