-
Notifications
You must be signed in to change notification settings - Fork 0
/
RendomForest.py
149 lines (130 loc) · 5.59 KB
/
RendomForest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_curve,
roc_auc_score, precision_score, recall_score, f1_score)
# Importing the dataset
datafile = "https://raw.githubusercontent.com/AbdulMoaizz/dataset/main/telecom_customer_churn.csv"
# Read the CSV file into a DataFrame
df = pd.read_csv(datafile)
# Drop unnecessary columns
drop_columns = ['Customer ID', 'City', 'Zip Code', 'Latitude',
'Longitude', 'Number of Referrals',
'Device Protection Plan', 'Premium Tech Support',
'Total Refunds', 'Total Revenue', 'Churn Category', 'Churn Reason']
df = df.drop(drop_columns, axis=1)
# Drop rows where Customer Status is 'Joined'
df = df[df['Customer Status'] != 'Joined']
# Fill missing values
df['Offer'] = df['Offer'].fillna('No Offer')
df['Avg Monthly Long Distance Charges'] = df['Avg Monthly Long Distance Charges'].fillna(0)
df['Multiple Lines'] = df['Multiple Lines'].fillna('No')
df['Avg Monthly GB Download'] = df['Avg Monthly GB Download'].fillna(np.random.randint(10, 30))
df[['Online Security', 'Online Backup',
'Streaming TV', 'Streaming Movies',
'Streaming Music', 'Unlimited Data'
]] = df[['Online Security', 'Online Backup',
'Streaming TV', 'Streaming Movies',
'Streaming Music', 'Unlimited Data'
]].fillna('No')
# Fill missing values with the randomly sampled values in Internet Type
values_distribution = df['Internet Type'].value_counts(normalize=True)
missing_index = df[df['Internet Type'].isnull()].index
random_samples = np.random.choice(values_distribution.index, size=len(missing_index), p=values_distribution.values)
df.loc[missing_index, 'Internet Type'] = random_samples
# Mapping for numerical values
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Offer'] = df['Offer'].map({'No Offer': 0, 'Offer A': 1, 'Offer B': 2, 'Offer C': 3, 'Offer D': 4, 'Offer E': 5})
df['Phone Service'] = df['Phone Service'].map({'Yes': 1, 'No': 0})
df['Multiple Lines'] = df['Multiple Lines'].map({'Yes': 1, 'No': 0})
df['Internet Service'] = df['Internet Service'].map({'Yes': 1, 'No': 0})
df['Internet Type'] = df['Internet Type'].map({'Cable': 1, 'Fiber Optic': 2, 'DSL': 3, '4G': 4})
df['Online Security'] = df['Online Security'].map({'Yes': 1, 'No': 0})
df['Online Backup'] = df['Online Backup'].map({'Yes': 1, 'No': 0})
df['Streaming TV'] = df['Streaming TV'].map({'Yes': 1, 'No': 0})
df['Streaming Movies'] = df['Streaming Movies'].map({'Yes': 1, 'No': 0})
df['Streaming Music'] = df['Streaming Music'].map({'Yes': 1, 'No': 0})
df['Unlimited Data'] = df['Unlimited Data'].map({'Yes': 1, 'No': 0})
df['Contract'] = df['Contract'].map({'Month-to-Month': 0, 'One Year': 1, 'Two Year': 2})
df['Paperless Billing'] = df['Paperless Billing'].map({'Yes': 1, 'No': 0})
df['Payment Method'] = df['Payment Method'].map({'Credit Card': 1, 'Bank Withdrawal': 2, 'Mailed Check': 3})
df['Customer Status'] = df['Customer Status'].map({'Churned': 0, 'Stayed': 1})
# Separate features and target
X = df.drop('Customer Status', axis=1)
y = df['Customer Status']
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
# Training the model
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=50)
rf.fit(X_train, y_train)
# Predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)
# F1 Score
print("F1 Score:", f1_score(y_train, y_train_pred))
# Predict with new data
new_data = pd.DataFrame({
'Gender': [0],
'Age': [100],
'Married': [1],
'Number of Dependents': [1],
'Tenure in Months': [7],
'Offer': [2],
'Phone Service': [1],
'Avg Monthly Long Distance Charges': [40],
'Multiple Lines': [1],
'Internet Service': [1],
'Internet Type': [1],
'Avg Monthly GB Download': [50],
'Online Security': [1],
'Online Backup': [1],
'Streaming TV': [1],
'Streaming Movies': [1],
'Streaming Music': [1],
'Unlimited Data': [1],
'Contract': [1],
'Paperless Billing': [0],
'Payment Method': [2],
'Monthly Charge': [50],
'Total Charges': [350.99],
'Total Extra Data Charges': [10],
'Total Long Distance Charges': [90]
})
# Predict
new_prediction = rf.predict(new_data)
if new_prediction[0] == 0:
print('New Customer is predicted to churn.')
else:
print('New Customer is predicted to stay.')
# Evaluate model performance
# Cross-validation
cv_scores = cross_val_score(rf, X, y, cv=5, scoring='f1')
print("Cross-Validation F1 Score:", np.mean(cv_scores))
# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Stayed', 'Churned'],
yticklabels=['Stayed', 'Churned'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# Evaluate using ROC curve and AUC score
y_test_prob = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
plt.plot(fpr, tpr, label='ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
print("AUC Score:", roc_auc_score(y_test, y_test_prob))
# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)