
Introduction

The Rail Break Prediction AI project focuses on building a data pipeline to extract, enrich, and analyse real-world data with machine learning models, with the goal of predicting rail breaks within the next 30 days on the Insight Factory platform.
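For illustration, a minimal sketch of how a 30-day-ahead target could be derived in PySpark. The table names (demo.sensor_readings, demo.rail_breaks) and columns (asset_id, reading_id, reading_date, break_date) are hypothetical assumptions, not the project's actual schema:

from pyspark.sql import functions as F

# Hypothetical input tables, for illustration only.
readings = spark.table("demo.sensor_readings")   # one row per asset per day
breaks = spark.table("demo.rail_breaks")         # one row per recorded break

# readings that are followed by a break on the same asset within 30 days
positives = readings.join(
    breaks,
    (readings.asset_id == breaks.asset_id)
    & (breaks.break_date > readings.reading_date)
    & (breaks.break_date <= F.date_add(readings.reading_date, 30)),
    "left_semi",
).select("reading_id").withColumn("target", F.lit(1))

# left join back and default unmatched readings to target = 0
labelled = readings.join(positives, "reading_id", "left").fillna({"target": 0})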

Software Architecture

Architecture.png


Code - XAI of Decision Tree

0 - Data and Train

df = spark.sql("""select * from demo.training_table""")
df = df.fillna(0)
# select the feature and target columns
feature_columns = ['BrakeCylinder', 'IntrainForce', 'SND']
target_column = 'target'
# extract X and y from the Spark DataFrame into pandas
X = df.select(feature_columns).toPandas()
y = df.select(target_column).toPandas()

print(X.columns)
print(y.columns)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. create a tree classifier
clf = DecisionTreeClassifier()

# 3. train
clf.fit(X_train, y_train.values.ravel())  # ravel to 1-D to avoid a column-vector warning

# 4. predict
y_pred = clf.predict(X_test)

# 5. get accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# F1 score
f1 = f1_score(y_test, y_pred, average='binary')
# 'micro', 'macro', or 'weighted' are also available, depending on your need
print(f"F1 Score: {f1}")

# manually tally the confusion matrix
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for i in range(len(y_pred)):
    pred = y_pred[i] != 0
    real = y_test.iloc[i, 0] != 0
    if not pred and not real:
        true_negative += 1
    elif pred and real:
        true_positive += 1
    elif not pred and real:
        false_negative += 1
    else:
        false_positive += 1
print("true positive  = ", true_positive)
print("true negative  = ", true_negative)
print("false positive = ", false_positive)
print("false negative = ", false_negative)
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
print("precision = ", precision)
print("recall    = ", recall)
print("f1 = ", 2 * precision * recall / (precision + recall))
# TP / (TP + FP + FN) is the Jaccard index (critical success index), not precision
print("my precision = ", true_positive / (true_positive + false_positive + false_negative))
# sanity checks on shapes and indexing
print(type(y_test))
print(y_test.shape)
print(y_pred.shape)
print(y_test.iloc[0])
print(y_test.iloc[0, 0])

# model summary
print(clf)
print(clf.classes_)
print(clf.n_features_in_)
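As a cross-check on the hand-tallied counts above, scikit-learn's built-in metrics should give the same numbers; a minimal sketch:

from sklearn.metrics import confusion_matrix, classification_report

# for binary labels, ravel() unpacks the 2x2 matrix as tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"TP={tp}  TN={tn}  FP={fp}  FN={fn}")
print(classification_report(y_test, y_pred))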

0 - Control Center

# control center
class_number = clf.n_classes_
feature_number = clf.n_features_in_
print(f"class number   = {class_number} -> {clf.classes_}")
print(f"feature number = {feature_number}")

# clf is the trained decision tree model
class_names = [f"class{i}" for i in range(class_number)]  # e.g. ['class0', 'class1']
# feature_names = [str(i) for i in range(feature_number)]
feature_names = ['BrakeCylinder', 'IntrainForce', 'SND']

1 - Visualise Tree

from sklearn import tree
import matplotlib.pyplot as plt

# visualise the decision tree (truncated to the top 5 levels for readability)
plt.figure(figsize=(100, 30))
tree.plot_tree(clf, feature_names=feature_names, class_names=class_names,
               filled=True, max_depth=5, fontsize=12)
plt.show()

image.png
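When the rendered plot is too large to read comfortably, the same tree can be dumped as plain text with scikit-learn's export_text; a minimal sketch:

from sklearn.tree import export_text

# text rendering of the fitted tree, truncated to the top 5 levels
print(export_text(clf, feature_names=feature_names, max_depth=5))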

2 - Feature Importance
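A minimal sketch of this step, using the impurity-based (Gini) importances that scikit-learn exposes on the fitted tree:

import pandas as pd

# impurity-based feature importances, highest first
importances = pd.Series(clf.feature_importances_, index=feature_names)
print(importances.sort_values(ascending=False))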