analyze_excel.py 8.04 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel工资核算文件分析工具
分析工资核算相关的Excel文件,提取字段信息和引用关系
"""

import pandas as pd
import os
import re
from pathlib import Path

def analyze_excel_file(file_path, sheet_name=None):
    """分析单个Excel文件"""
    try:
        # 获取所有工作表名称
        excel_file = pd.ExcelFile(file_path)
        sheets = excel_file.sheet_names
        
        print(f"\n=== 分析文件: {os.path.basename(file_path)} ===")
        print(f"工作表数量: {len(sheets)}")
        print(f"工作表名称: {sheets}")
        
        analysis_result = {
            'file_name': os.path.basename(file_path),
            'sheets': {},
            'all_columns': set(),
            'formulas': [],
            'references': []
        }
        
        # 分析每个工作表
        for sheet in sheets:
            if sheet_name and sheet != sheet_name:
                continue
                
            try:
                df = pd.read_excel(file_path, sheet_name=sheet, header=0)
                print(f"\n--- 工作表: {sheet} ---")
                print(f"行数: {len(df)}")
                print(f"列数: {len(df.columns)}")
                print(f"列名: {list(df.columns)}")
                
                # 收集列名
                analysis_result['all_columns'].update(df.columns)
                analysis_result['sheets'][sheet] = {
                    'rows': len(df),
                    'columns': len(df.columns),
                    'column_names': list(df.columns),
                    'sample_data': df.head(3).to_dict('records') if len(df) > 0 else []
                }
                
                # 查找公式和引用
                for col in df.columns:
                    for idx, value in df[col].items():
                        if pd.notna(value) and isinstance(value, str):
                            # 查找Excel公式模式
                            formula_patterns = [
                                r'=([A-Z]+[0-9]+)',  # 单元格引用
                                r'=([A-Z]+[0-9]+:[A-Z]+[0-9]+)',  # 范围引用
                                r'=([A-Z]+[0-9]+\[[^\]]+\])',  # 数组引用
                                r'=([A-Z]+[0-9]+\$[0-9]+)',  # 绝对引用
                                r'=([A-Z]+[0-9]+:[A-Z]+[0-9]+\$[0-9]+)',  # 绝对范围引用
                            ]
                            
                            for pattern in formula_patterns:
                                matches = re.findall(pattern, str(value))
                                if matches:
                                    analysis_result['formulas'].append({
                                        'sheet': sheet,
                                        'cell': f"{col}{idx+1}",
                                        'formula': str(value),
                                        'references': matches
                                    })
                                    analysis_result['references'].extend(matches)
                
            except Exception as e:
                print(f"分析工作表 {sheet} 时出错: {e}")
                analysis_result['sheets'][sheet] = {'error': str(e)}
        
        return analysis_result
        
    except Exception as e:
        print(f"分析文件 {file_path} 时出错: {e}")
        return None

def main():
    """主函数"""
    base_path = "/Users/mr.wang/代码库/绿纤/lvqianmeiye_ERP/参考资料/工资核算 -7月"
    
    # 要分析的文件列表
    files_to_analyze = [
        "工资(全字段).xlsx",
        "①B-a-26考勤汇总表.xlsx",
        "②B-a-17每日早报.xlsx", 
        "③B-a-③呈现-消耗明细表.xlsx",
        "④B-a-25社保统计表.xlsx",
        "⑤B-a-12奖励统计表.xlsx",
        "B-b-③健康师底薪.xlsx",
        "B-b-④健康师提成-金三角顾问.xlsx",
        "B-b-⑤其他岗位工资.xlsx",
        "B-b-⑤当月数据及门店毛利.xlsx"
    ]
    
    all_results = {}
    
    print("开始分析工资核算Excel文件...")
    print("=" * 60)
    
    for filename in files_to_analyze:
        file_path = os.path.join(base_path, filename)
        if os.path.exists(file_path):
            result = analyze_excel_file(file_path)
            if result:
                all_results[filename] = result
        else:
            print(f"文件不存在: {filename}")
    
    # 生成分析报告
    generate_report(all_results)

def generate_report(all_results):
    """生成分析报告"""
    report_path = "/Users/mr.wang/代码库/绿纤/lvqianmeiye_ERP/工资核算Excel分析报告.md"
    
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# 工资核算Excel文件分析报告\n\n")
        f.write("## 概述\n\n")
        f.write("本报告分析了工资核算相关的Excel文件,提取了字段信息、公式引用关系等关键数据。\n\n")
        
        # 文件概览
        f.write("## 文件概览\n\n")
        f.write("| 文件名 | 工作表数量 | 总列数 | 状态 |\n")
        f.write("|--------|------------|--------|------|\n")
        
        for filename, result in all_results.items():
            if result:
                sheet_count = len(result['sheets'])
                total_columns = len(result['all_columns'])
                f.write(f"| {filename} | {sheet_count} | {total_columns} | ✅ |\n")
            else:
                f.write(f"| {filename} | - | - | ❌ |\n")
        
        # 详细分析
        f.write("\n## 详细分析\n\n")
        
        for filename, result in all_results.items():
            if not result:
                continue
                
            f.write(f"### {filename}\n\n")
            
            # 工作表信息
            f.write("#### 工作表信息\n\n")
            for sheet_name, sheet_info in result['sheets'].items():
                if 'error' in sheet_info:
                    f.write(f"- **{sheet_name}**: 分析出错 - {sheet_info['error']}\n")
                else:
                    f.write(f"- **{sheet_name}**: {sheet_info['rows']}行 x {sheet_info['columns']}列\n")
                    f.write(f"  - 列名: {', '.join([str(col) for col in sheet_info['column_names']])}\n")
            
            # 字段统计
            f.write(f"\n#### 字段统计\n\n")
            f.write(f"总字段数: {len(result['all_columns'])}\n\n")
            f.write("所有字段列表:\n")
        for col in sorted(result['all_columns'], key=str):
            f.write(f"- {col}\n")
            
            # 公式引用
            if result['formulas']:
                f.write(f"\n#### 公式引用\n\n")
                f.write(f"发现 {len(result['formulas'])} 个公式引用:\n\n")
                for formula in result['formulas'][:10]:  # 只显示前10个
                    f.write(f"- **{formula['cell']}** ({formula['sheet']}): {formula['formula']}\n")
                    f.write(f"  - 引用: {', '.join(formula['references'])}\n")
                
                if len(result['formulas']) > 10:
                    f.write(f"\n... 还有 {len(result['formulas']) - 10} 个公式\n")
            
            f.write("\n---\n\n")
        
        # 字段汇总
        f.write("## 字段汇总\n\n")
        all_columns = set()
        for result in all_results.values():
            if result:
                all_columns.update(result['all_columns'])
        
        f.write(f"所有文件共发现 {len(all_columns)} 个唯一字段:\n\n")
        for col in sorted(all_columns, key=str):
            f.write(f"- {col}\n")
        
        # 引用关系汇总
        f.write("\n## 引用关系汇总\n\n")
        all_references = set()
        for result in all_results.values():
            if result:
                all_references.update(result['references'])
        
        f.write(f"发现 {len(all_references)} 个唯一引用:\n\n")
        for ref in sorted(all_references, key=str):
            f.write(f"- {ref}\n")
    
    print(f"\n分析报告已生成: {report_path}")

if __name__ == "__main__":
    main()