注意：請先在命令提示字元 (cmd) 中執行以下這行 pip 安裝指令（此行是命令列指令，不是 Python 程式碼）：
pip install pandas scikit-learn xgboost shap matplotlib



以下為在 IDLE 中執行的 XGBoost 程式碼：

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
import shap
import matplotlib.pyplot as plt

# 設定顯示中文，以防 SHAP 圖表亂碼 (如果在 IDLE 執行圖表時遇到問題，可能需要調整)
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# plt.rcParams['axes.unicode_minus'] = False

# ==========================================================
# Step 1: Load the data and one-hot encode it (preprocessing)
# ==========================================================

# Adjust here: name of the input CSV file to load.
FILE_NAME = 'expense_fraud_dataset_v2.csv'

try:
    df = pd.read_csv(FILE_NAME)
except FileNotFoundError:
    print(f"錯誤：找不到檔案 {FILE_NAME}。請確認檔案與腳本在同一個資料夾。")
    # Raise SystemExit directly rather than calling exit(): exit() is a helper
    # injected by the `site` module (not guaranteed to exist), and calling it
    # with no argument reports success (status 0) even though loading failed.
    raise SystemExit(1)

# Drop the Date column; per the original author it is indirectly represented
# by the Is_Weekend column.
df_processed = df.drop('Date', axis=1)

# One-hot encode the categorical features.
categorical_cols = ['Department', 'Category']
df_encoded = pd.get_dummies(
    df_processed,
    columns=categorical_cols,
    drop_first=True,  # drop one level per feature to avoid collinearity
    dtype=int,
)

# Separate the feature matrix X from the target variable y.
X = df_encoded.drop('Is_Fraud', axis=1)
y = df_encoded['Is_Fraud']

# Split into training and test sets (80% train, 20% test).
# stratify=y keeps the fraud ratio the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Positive-class weight used to compensate for class imbalance:
# weight = number of normal samples / number of fraud samples.
fraud_count = int(y_train.sum())
normal_count = len(y_train) - fraud_count
if fraud_count == 0:
    # Without any positive sample the ratio is a division by zero (it would
    # silently become inf), so stop with a clear message instead.
    print("錯誤：訓練集中沒有任何舞弊樣本 (Is_Fraud = 1)，無法計算 scale_pos_weight。")
    raise SystemExit(1)
scale_pos_weight_value = normal_count / fraud_count

print("資料預處理完成。")
print(f"訓練集中舞弊與正常的權重比 (scale_pos_weight): {scale_pos_weight_value:.2f}")


# ==========================================================
# Step 2: Train the XGBoost model
# ==========================================================
print("\n========== 開始訓練 XGBoost 模型 ==========")

# NOTE: `use_label_encoder` is deliberately not passed. The option was removed
# from XGBClassifier in XGBoost 2.0; on current releases it is forwarded as an
# unknown booster parameter and triggers a "Parameters ... are not used"
# warning instead of suppressing one.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,          # number of trees
    max_depth=4,               # maximum depth of each tree
    learning_rate=0.1,         # learning rate
    scale_pos_weight=scale_pos_weight_value,  # compensates for class imbalance
    eval_metric='logloss',
    random_state=42,
)

model.fit(X_train, y_train)
print("模型訓練完成。")


# ==========================================================
# Step 3: Evaluate the model on the test set (teaching demo)
# ==========================================================
# Column 1 of predict_proba is kept as the score for the AUC; the hard labels
# from predict feed the confusion matrix and the classification report.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\n========== 測試集模型評估結果 ==========", "--- 混淆矩陣 ---", sep="\n")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

print("\n--- 完整分類報告 ---")
# The report covers precision, recall and F1-score for each class.
class_labels = ['正常(0)', '舞弊(1)']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print(report_text)

auc_score = roc_auc_score(y_test, y_proba)
print(f"\nAUC 分數 (模型整體區分能力): {auc_score:.4f}")


# ==========================================================
# Step 4: SHAP model explainability (teaching highlight)
# ==========================================================
print("\n========== SHAP 模型解釋：關鍵洞察 ==========")

# 1. Global explanation: feature importance.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

print("--- 輸出特徵重要性總結圖 (Summary Plot) ---")
# Shows which features influence the predictions the most overall.
# When run from IDLE, the chart pops up in a separate window.
shap.summary_plot(shap_values, X_test, plot_type="bar")

# 2. Local explanation: pick the single riskiest case.
# argsort is ascending, so take the last five positions and reverse them to
# get the five highest predicted probabilities, highest first.
top_risky_indices = np.argsort(y_proba)[-5:][::-1]
riskiest_case_index = top_risky_indices[0]  # position of the riskiest transaction

print(f"\n--- 案例分析：風險最高的交易 (Index {X_test.index[riskiest_case_index]}) ---")
print(X_test.iloc[riskiest_case_index])

print("\n--- 輸出單一案例的解釋圖 (Force Plot) ---")
# Shows which features pushed this transaction's fraud risk up or down.
# The plot is drawn with matplotlib rather than the default JavaScript
# renderer: the JS version needs shap.initjs(), which asserts that IPython is
# installed, and its return value only displays inside a notebook. With
# matplotlib=True the figure works from a plain script such as an IDLE run.
shap.force_plot(
    explainer.expected_value,
    shap_values[riskiest_case_index],
    X_test.iloc[riskiest_case_index],
    matplotlib=True,
    show=False,  # defer display to the plt.show() call below
)

plt.show()  # display the force plot figure