E_Commerce Data model

7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 1/38
Numeric variables count: 8
Categorical variables count: 4
In [2]: import pandas as pd
# Uploading the actual file
df = pd.read_csv('E_Commerce.csv')
In [3]: # Calculate the count of numeric variables
numeric_count = df.select_dtypes(include='number').shape[1]
# Calculate the count of categorical variables
categorical_count = df.select_dtypes(include='object').shape[1]
print("Numeric variables count:", numeric_count)
print("Categorical variables count:", categorical_count)
In [9]: data_dict = pd.DataFrame(columns=['Variable Name', 'Description', 'Type'])
# Add variable information to the data dictionary
data_dict['Variable Name'] = df.columns
data_dict['Description'] = ['ID Number of Customers',
'The Company's warehouse block (A, B, C, D, E)',
'The shipping mode (Ship, Flight, Road)',
'Number of calls made by the customer for shipment inqu
'Customer rating (1 to 5, with 1 being the lowest)',
'Cost of the product in US Dollars',
'Number of prior purchases',
'Importance of the product (low, medium, high)',
'Gender of the customer (Male, Female)',
'Discount offered on the product',
'Weight of the product in grams',
'Target variable indicating whether the product reached
# Determine the type of each variable
data_dict['Type'] = df.dtypes.values
# Display the data dictionary
data_dict

7/14/23, 10:47 PM E_Commerce
Variable Name Description Type
0 ID ID Number of Customers int64
1 Warehouse_block The Company's warehouse block (A, B, C, D, E) object
2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) object
3 Customer_care_calls Number of calls made by the customer for shipm... int64
4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) int64
5 Cost_of_the_Product Cost of the product in US Dollars int64
6 Prior_purchases Number of prior purchases int64
7 Product_importance Importance of the product (low, medium, high) object
8 Gender Gender of the customer (Male, Female) object
9 Discount_offered Discount offered on the product int64
10 Weight_in_gms Weight of the product in grams int64
11 Reached.on.Time_Y.N Target variable indicating whether the product... int64
Variable Name Description Type
0 ID ID Number of Customers Numerical
1 Warehouse_block The Company's warehouse block (A, B, C, D, E) Categorical
2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) Categorical
3 Customer_care_calls Number of calls made by the customer for shipm... Numerical
4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) Numerical
5 Cost_of_the_Product Cost of the product in US Dollars Numerical
6 Prior_purchases Number of prior purchases Numerical
7 Product_importance Importance of the product (low, medium, high) Categorical
8 Gender Gender of the customer (Male, Female) Categorical
9 Discount_offered Discount offered on the product Numerical
10 Weight_in_gms Weight of the product in grams Numerical
11 Reached.on.Time_Y.N Target variable indicating whether the product... Numerical
Out[9]:
In [10]: # Determine the type of each variable
data_dict['Type'] = df.dtypes.replace({'int64': 'Numerical', 'object': 'Categorical
# Display the data dictionary
data_dict
Out[10]:
In [11]: # Count of missing/null values
missing_values_count = df.isnull().sum()
# Redundant columns
redundant_columns = []
# Identify redundant columns (columns with constant values)
for column in df.columns:
if df[column].nunique() == 1:
redundant_columns.append(column)

7/14/23, 10:47 PM E_Commerce
Count of missing/null values:
ID 0
Warehouse_block 0
Mode_of_Shipment 0
Customer_care_calls 0
Customer_rating 0
Cost_of_the_Product 0
Prior_purchases 0
Product_importance 0
Gender 0
Discount_offered 0
Weight_in_gms 0
Reached.on.Time_Y.N 0
dtype: int64
Redundant columns:
[]
# Print the count of missing/null values
print("Count of missing/null values:")
print(missing_values_count)
# Print the redundant columns
print("nRedundant columns:")
print(redundant_columns)
import matplotlib.pyplot as plt
import seaborn as sns
# Read the dataset into a pandas DataFrame
# Relationship between variables
sns.pairplot(df)
plt.show()

7/14/23, 10:47 PM E_Commerce
In [14]: # Check for multicollinearity
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")
plt.title("Correlation Matrix")
plt.show()

7/14/23, 10:47 PM E_Commerce
In [15]: # Distribution of variables
df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()
In [16]: # Presence of outliers
df.boxplot(figsize=(10, 8))
plt.show()

7/14/23, 10:47 PM E_Commerce
In [17]: # Statistical significance of variables
# Class imbalance
sns.countplot(x='Reached.on.Time_Y.N', data=df)
plt.show()
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
# Transformation of Numerical Features
df['transformed_weight'] = np.sqrt(df['Weight_in_gms'])

7/14/23, 10:47 PM E_Commerce
In [22]: # Scaling the Data
scaler = StandardScaler()
df['scaled_weight'] = scaler.fit_transform(df[['Weight_in_gms']])
In [ ]: # Feature Selection
selector = SelectKBest(score_func=chi2, k=5) # Select top 5 features
selected_features = selector.fit_transform(df[['Customer_care_calls', 'Customer_rat
In [23]: # Dimensionality Reduction
pca = PCA(n_components=2) # Reduce to 2 principal components
reduced_features = pca.fit_transform(df[['Customer_care_calls', 'Customer_rating',
In [24]: # Print the updated DataFrame with transformed, scaled, selected, and reduced featu
print(df)

7/14/23, 10:47 PM E_Commerce
ID Warehouse_block Mode_of_Shipment Customer_care_calls
0 1 D Flight 4
1 2 F Flight 4
2 3 A Flight 2
3 4 B Flight 3
4 5 C Flight 2
... ... ... ... ...
10994 10995 A Ship 4
10995 10996 B Ship 4
10996 10997 C Ship 5
10997 10998 F Ship 5
10998 10999 D Ship 2
Customer_rating Cost_of_the_Product Prior_purchases
0 2 177 3
1 5 216 2
2 2 183 4
3 3 176 4
4 2 184 3
... ... ... ...
10994 1 252 5
10995 1 232 5
10996 4 242 5
10997 2 223 6
10998 5 155 5
Product_importance Gender Discount_offered Weight_in_gms
0 low F 44 1233
1 low M 59 3088
2 low M 48 3374
3 medium M 10 1177
4 medium F 46 2484
... ... ... ... ...
10994 medium F 1 1538
10995 medium F 6 1247
10996 low F 4 1155
10997 medium M 2 1210
10998 low F 6 1639
Reached.on.Time_Y.N transformed_weight scaled_weight
0 1 35.114100 -1.468240
1 1 55.569776 -0.333893
2 1 58.086143 -0.159002
3 1 34.307434 -1.502484
4 1 49.839743 -0.703244
... ... ... ...
10994 1 39.217343 -1.281730
10995 0 35.312887 -1.459679
10996 0 33.985291 -1.515937
10997 0 34.785054 -1.482304
10998 0 40.484565 -1.219968
[10999 rows x 14 columns]
In [25]: from sklearn.linear_model import LinearRegression
# Create the SLR model
model_slr = LinearRegression()
# Prepare the features and target variables
X_slr = df[['Cost_of_the_Product']]
y_slr = df['Weight_in_gms']
# Fit the model

7/14/23, 10:47 PM E_Commerce
R-squared: 0.01758383342649017
Mean Squared Error: 2627192.6380324196
model_slr.fit(X_slr, y_slr)
# Predict the target variable
y_pred_slr = model_slr.predict(X_slr)
# Create the Multiple Linear Regression model
model_multiple = LinearRegression()
X_multiple = df[['Cost_of_the_Product', 'Prior_purchases']]
y_multiple = df['Weight_in_gms']
# Fit the model
model_multiple.fit(X_multiple, y_multiple)
y_pred_multiple = model_multiple.predict(X_multiple)
from sklearn.metrics import r2_score, mean_squared_error
# Create the SLR model
model_slr = LinearRegression()
X_slr = df[['Cost_of_the_Product']]
y_slr = df['Weight_in_gms']
# Fit the model
model_slr.fit(X_slr, y_slr)
y_pred_slr = model_slr.predict(X_slr)
# Calculate R-squared
r2 = r2_score(y_slr, y_pred_slr)
# Calculate MSE
mse = mean_squared_error(y_slr, y_pred_slr)
# Print the R-squared and MSE values
print("R-squared:", r2)
print("Mean Squared Error:", mse)
In [29]: import matplotlib.pyplot as plt
# Extract the column values for plotting
x_values = df['Cost_of_the_Product']
y_actual = df['Weight_in_gms']
# Plot the actual data points
plt.scatter(x_values, y_actual, color='blue', label='Actual')
# Plot the regression line
plt.plot(x_values, y_pred_slr, color='red', label='Regression Line')
# Set plot labels and title
plt.xlabel('Cost of the Product')

7/14/23, 10:47 PM E_Commerce
plt.ylabel('Weight in grams')
plt.title('Simple Linear Regression')
# Display legend
plt.legend()
# Show the plot
plt.show()
In [42]: import matplotlib.pyplot as plt
# Plot the actual values
plt.scatter(X_multiple['Cost_of_the_Product'], y_multiple, color='blue', label='Act
# Plot the predicted values
plt.scatter(X_multiple['Cost_of_the_Product'], y_pred_multiple, color='red', label=
# Add the Prior_purchases column to the plot
plt.scatter(X_multiple['Cost_of_the_Product'], X_multiple['Prior_purchases'], color
plt.xlabel('Cost_of_the_Product')
plt.ylabel('Weight_in_gms')
plt.title('Actual vs Predicted Values')
# Add a legend
plt.legend()
# Display the plot
plt.show()

7/14/23, 10:47 PM E_Commerce
In [46]: from sklearn.linear_model import LogisticRegression
# Create the Logistic Regression model
model_logistic = LogisticRegression()
X_logistic = df[['Cost_of_the_Product', 'Prior_purchases']]
y_logistic = df['Reached.on.Time_Y.N']
# Fit the model
model_logistic.fit(X_logistic, y_logistic)
y_pred_logistic = model_logistic.predict(X_logistic)
# Create a scatter plot of the predicted values
plt.scatter(range(len(y_pred_logistic)), y_pred_logistic, color='blue', label='Pred
# Plot the actual values
plt.scatter(range(len(y_logistic)), y_logistic, color='red', label='Actual')
plt.xlabel('Data Point')
plt.ylabel('Reached on Time')
plt.title('Logistic Regression Predictions')
# Add a legend
plt.legend()
# Show the plot
plt.show()

7/14/23, 10:47 PM E_Commerce
Accuracy: 0.5545454545454546
In [52]: from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the Decision Tree classifier
model_decision_tree = DecisionTreeClassifier()
# Fit the model on the training data
model_decision_tree.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_decision_tree.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [53]: from sklearn.ensemble import RandomForestClassifier
# Create the Random Forest classifier
model_random_forest = RandomForestClassifier()
model_random_forest.fit(X_train, y_train)
y_pred = model_random_forest.predict(X_test)

7/14/23, 10:47 PM E_Commerce
Accuracy: 0.5622727272727273
Accuracy: 0.5931818181818181
Accuracy: 0.5636363636363636
In [54]: from sklearn.svm import SVC
# Create the SVM classifier
model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)
In [55]: from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Create the base classifier
base_classifier = DecisionTreeClassifier()
# Create the bagging classifier
model_bagging = BaggingClassifier(base_estimator=base_classifier, n_estimators=10,
model_bagging.fit(X_train, y_train)
y_pred = model_bagging.predict(X_test)
In [56]: from sklearn.ensemble import AdaBoostClassifier

7/14/23, 10:47 PM E_Commerce
Accuracy: 0.5645454545454546
# Create the base classifier
base_classifier = DecisionTreeClassifier()
# Create the AdaBoost classifier
model_adaboost = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=10
model_adaboost.fit(X_train, y_train)
y_pred = model_adaboost.predict(X_test)
In [57]: from sklearn.decomposition import PCA
# Prepare the features
# Standardize the features
X_scaled = scaler.fit_transform(X)
# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot the data points in the reduced dimension space
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA')
plt.show()

7/14/23, 10:47 PM E_Commerce
In [58]: from sklearn.cluster import KMeans
# Prepare the features
# Standardize the features
# Instantiate the K-Means clustering algorithm
kmeans = KMeans(n_clusters=3, random_state=42)
# Fit the model on the scaled data
kmeans.fit(X_scaled)
# Predict the cluster labels
labels = kmeans.predict(X_scaled)
# Plot the data points with color-coded clusters
plt.scatter(X['Cost_of_the_Product'], X['Prior_purchases'], c=labels)
plt.xlabel('Cost of the Product')
plt.ylabel('Prior Purchases')
plt.title('K-Means Clustering')
plt.show()

7/14/23, 10:47 PM E_Commerce
Accuracy: 0.5895454545454546
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
# Split the data into training and testing sets
from sklearn.linear_model import LogisticRegression
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
In [5]: import numpy as np
# Fit the logistic regression model
# Predict on the training set
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate the training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
# Calculate the bias error
bias_error = np.mean(np.abs(y_train_pred - y_train))
# Calculate the variance error
variance_error = np.mean(np.abs(y_test_pred - y_test))
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Bias Error:", bias_error)
print("Variance Error:", variance_error)

7/14/23, 10:47 PM E_Commerce
Training Accuracy: 0.599613592453688
Test Accuracy: 0.5895454545454546
Bias Error: 0.4003864075463121
Variance Error: 0.41045454545454546
Cross-Validation Scores: [0.59727273 0.59681818 0.59681818 0.55954545 0.55025011]
Mean CV Accuracy: 0.5801409318285171
Ensemble Model Accuracy: 0.5281818181818182
Best Parameters: {'C': 0.1}
Best Model Accuracy: 0.5895454545454546
In [6]: from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define the logistic regression model
# Perform cross-validation
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cross_val_scores)
print("Mean CV Accuracy:", cross_val_scores.mean())
In [7]: # Use ensemble models (Random Forest)
ensemble_model = RandomForestClassifier(n_estimators=100, random_state=42)
ensemble_model.fit(X_train, y_train)
ensemble_accuracy = ensemble_model.score(X_test, y_test)
print("Ensemble Model Accuracy:", ensemble_accuracy)
In [8]: # Perform hyperparameter tuning using GridSearchCV
parameters = {'C': [0.1, 1, 10]}
grid_search = GridSearchCV(model, parameters, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
In [9]: # Use the best parameters to fit the model
best_model = LogisticRegression(C=best_params['C'])
best_model.fit(X_train, y_train)
best_model_accuracy = best_model.score(X_test, y_test)
print("Best Model Accuracy:", best_model_accuracy)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Select the relevant columns for regression
# and 'Reached on time' as the dependent variable
# Create and fit the linear regression model
model = LinearRegression()

7/14/23, 10:47 PM E_Commerce
RMSE: 0.49030942756132945
R-squared: 0.0037867760725102118
Classification Report:
precision recall f1-score support
0 0.43 0.03 0.05 895
1 0.59 0.98 0.74 1305
accuracy 0.59 2200
macro avg 0.51 0.50 0.39 2200
weighted avg 0.53 0.59 0.46 2200
AUC-ROC Score: 0.5386027098182752
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Calculate R-squared value
r2 = r2_score(y_test, y_pred)
# Print the evaluation metrics
print("RMSE:", rmse)
print("R-squared:", r2)
from sklearn.metrics import classification_report, roc_auc_score
# Select the relevant columns for classification
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of positive class
# Generate classification report
classification_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_report)
# Calculate AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_roc)
from sklearn.cluster import KMeans

7/14/23, 10:47 PM E_Commerce
Inertia Value: 7956.899346042599
# Read the E-commerce dataset into a pandas DataFrame
# Select the relevant columns for clustering
# For example, let's consider 'Customer care calls' and 'Cost of the product' as th
X = df[['Customer_care_calls', 'Cost_of_the_Product']]
# Perform feature scaling
# Create and fit the K-means clustering model
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
# Calculate the inertia value
inertia = kmeans.inertia_
# Print the inertia value
print("Inertia Value:", inertia)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Define the models to compare
models = [
('Logistic Regression', LogisticRegression()),
('Decision Tree', DecisionTreeClassifier()),
('Random Forest', RandomForestClassifier())
]
# Iterate over the models
for model_name, model in models:
# Make predictions on the test data
# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

7/14/23, 10:47 PM E_Commerce
Model: Logistic Regression
Accuracy: 0.5895454545454546
Precision: 0.5936626281453867
Recall: 0.9762452107279693
F1-score: 0.7383367139959431
-------------------------
Model: Decision Tree
Accuracy: 0.5109090909090909
Precision: 0.5984522785898538
Recall: 0.5333333333333333
F1-score: 0.5640194489465155
-------------------------
Model: Random Forest
Accuracy: 0.5322727272727272
Precision: 0.5997109826589595
Recall: 0.6360153256704981
F1-score: 0.6173298624023801
-------------------------
# Print the results
print(f"Model: {model_name}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("-------------------------")
In [17]: import time
import pandas as pd
start_time = time.time()
end_time = time.time()
# Calculate the time taken for model training
training_time = end_time - start_time
print("Training Time:", training_time)
# Make predictions on the test set
start_time = time.time()
end_time = time.time()
# Calculate the time taken for making predictions
prediction_time = end_time - start_time
print("Prediction Time:", prediction_time)

7/14/23, 10:47 PM E_Commerce
Training Time: 0.04000043869018555
Prediction Time: 0.002998828887939453
# Select the relevant columns for EDA
# For example, let's consider 'Customer care calls', 'Customer rating', 'Cost of th
selected_columns = ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product
df_selected = df[selected_columns]
# Correlation Matrix
correlation_matrix = df_selected.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
In [20]: # Pair Plots
sns.pairplot(df_selected, hue='Reached.on.Time_Y.N', diag_kind='kde')
plt.title('Pair Plots')
plt.show()

7/14/23, 10:47 PM E_Commerce
In [21]: # Box Plots
for i, column in enumerate(selected_columns[:-1]):
plt.subplot(2, 2, i+1)
sns.boxplot(x='Reached.on.Time_Y.N', y=column, data=df_selected)
plt.xlabel('Reached.on.Time_Y.N')
plt.ylabel(column)
plt.tight_layout()
plt.suptitle('Box Plots', y=1.05)
plt.show()

7/14/23, 10:47 PM E_Commerce
In [22]: # Distribution Plots
for i, column in enumerate(selected_columns[:-1]):
plt.subplot(2, 2, i+1)
sns.histplot(data=df_selected, x=column, hue='Reached.on.Time_Y.N', kde=True)
plt.xlabel(column)
plt.ylabel('Count')
plt.tight_layout()
plt.suptitle('Distribution Plots', y=1.05)
plt.show()

7/14/23, 10:47 PM E_Commerce
Accuracy: 0.5895454545454546
# Calculate accuracy
import plotly.express as px
import plotly.graph_objects as go
# Model Parameters
model_params = {
'solver': 'lbfgs',
'C': 1.0,
'max_iter': 100,
# Add other model parameters as needed
}
# Create a bar chart for model parameters
fig = px.bar(
x=list(model_params.keys()),
y=list(model_params.values()),
labels={'x': 'Parameter', 'y': 'Value'},
title='Model Parameters'
)
fig.show()
# Create a table for model parameters
table_data = [['Parameter', 'Value']]
table_data.extend(list(model_params.items()))
fig = go.Figure(data=[go.Table(header=dict(values=table_data[0]), cells=dict(values
fig.show()

7/14/23, 10:47 PM E_Commerce

7/14/23, 10:47 PM E_Commerce
import numpy as np
# Define a range of parameter values to test
parameter_values = [0.001, 0.01, 0.1, 1, 10, 100]
# Initialize lists to store parameter values and corresponding accuracy scores
parameters = []
accuracy_scores = []
# Iterate over the parameter values
for param in parameter_values:
# Create and fit the logistic regression model with the current parameter value
model = LogisticRegression(C=param)

7/14/23, 10:47 PM E_Commerce
# Calculate accuracy and append to the lists
parameters.append(param)
accuracy_scores.append(accuracy)
# Plot the performance with varying parameters
plt.plot(parameters, accuracy_scores, marker='o')
plt.xlabel('Parameter Value')
plt.ylabel('Accuracy')
plt.title('Model Performance with Varying Parameters')
plt.xticks(np.arange(min(parameters), max(parameters) + 1, 1.0))
plt.show()
from sklearn.metrics import classification_report
# Generate classification report
classification_metrics = classification_report(y_test, y_pred, output_dict=True)

7/14/23, 10:47 PM E_Commerce
Metrics not available for class label: accuracy
# Extract metrics and class labels
metrics = ['precision', 'recall', 'f1-score', 'support']
class_labels = list(classification_metrics.keys())[:-1] # Exclude 'macro avg' and
# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame(index=class_labels, columns=metrics)
for label in class_labels:
if isinstance(classification_metrics[label], dict):
metrics_df.loc[label] = [classification_metrics[label][metric] for metric i
else:
print(f"Metrics not available for class label: {label}")
# Plot the metrics
sns.set_style("whitegrid")
metrics_df.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Class Labels')
plt.ylabel('Metric Score')
plt.title('Model Metrics - Classification Report')
plt.legend(title='Metrics', bbox_to_anchor=(1, 1))
plt.show()
model.fit(X, y)
# Predict the target variable for the entire dataset
y_pred = model.predict(X)

7/14/23, 10:47 PM E_Commerce
Reached.on.Time_Y.N Predicted Reached on time
0 1 1
1 1 1
2 1 1
3 1 1
4 1 1
... ... ...
10994 1 1
10995 0 1
10996 0 1
10997 0 1
10998 0 1
[10999 rows x 2 columns]
Percentage of correct predictions: 59.6872442949359
# Add the predicted values to the DataFrame
df['Predicted Reached on time'] = y_pred
# Print the testing outcome of the whole E-commerce dataset
print(df[['Reached.on.Time_Y.N', 'Predicted Reached on time']])
model.fit(X, y)
# Predict the target variable for the entire dataset
y_pred = model.predict(X)
# Calculate the accuracy
accuracy = accuracy_score(y, y_pred)
percentage_correct = accuracy * 100
# Print the percentage of correct predictions
print("Percentage of correct predictions:", percentage_correct)
# Read the E-commerce dat# Filter successful and obvious cases
success_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 0] # Assuming 0
obvious_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1
# Create visualizations for the successful cases
# Example: Histogram of Customer Ratings
plt.hist(success_cases['Customer_rating'], bins=10, alpha=0.5, color='green')
plt.xlabel('Customer_rating')
plt.ylabel('Count')

7/14/23, 10:47 PM E_Commerce
plt.title('Distribution of Customer_ratings for Successful Deliveries')
plt.show()
# Create visualizations for the obvious cases
# Example: Scatter Plot of Cost of the Product vs. Customer Rating
fig = go.Figure(data=go.Scatter(
x=obvious_cases['Cost_of_the_Product'],
y=obvious_cases['Customer_rating'],
mode='markers',
marker=dict(color='red')
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating for Obvious Failures',
xaxis_title='Cost of the Product',
yaxis_title='Customer Rating',
)
fig.show()

7/14/23, 10:47 PM E_Commerce
import plotly.subplots as sp
# Select the relevant columns for analysis
# For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache
df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N']
# Filter failure cases
failure_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1
# Create visualizations for the failure cases
# Example 1: Box Plot of Customer Rating
plt.boxplot(failure_cases['Customer_rating'])
plt.xlabel('Failure Cases')
plt.ylabel('Customer_rating')
plt.title('Distribution of Customer Ratings for Failure Cases')
plt.show()
# Example 2: Scatter Plot of Cost of the Product vs. Customer Rating
x=failure_cases['Cost_of_the_Product'],
y=failure_cases['Customer_rating'],
mode='markers',
marker=dict(color='red')

7/14/23, 10:47 PM E_Commerce
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating for Failure Cases',
)
fig.show()

7/14/23, 10:47 PM E_Commerce
# For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache
df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N']
# Filter border cases
border_cases = df_selected[
(df_selected['Reached.on.Time_Y.N'] == 0) | (df_selected['Reached.on.Time_Y.N'
] # Assuming 0 indicates successful delivery, and 1 indicates failure
# Create visualizations for the border cases
# Example 1: Scatter Plot of Cost of the Product vs. Customer Rating
x=border_cases['Cost_of_the_Product'],
y=border_cases['Customer_rating'],
mode='markers',
marker=dict(
color=border_cases['Reached.on.Time_Y.N'],
colorscale='Viridis',
showscale=True
)
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating (Border Cases)',

7/14/23, 10:47 PM E_Commerce
)
fig.show()
# Example 2: Pie Chart of Reached on Time vs. Not Reached on Time
counts = border_cases['Reached.on.Time_Y.N'].value_counts()
fig = go.Figure(data=go.Pie(labels=counts.index, values=counts.values))
fig.update_layout(
title='Distribution of Reached on Time vs. Not Reached on Time (Border Cases)',
)
fig.show()

7/14/23, 10:47 PM E_Commerce
# For example, let's consider 'Customer rating' and 'Reached on time'
df_selected = df[['Customer_rating', 'Reached.on.Time_Y.N']]
# Calculate the proportion of on-time deliveries for each customer rating
grouped_data = df_selected.groupby('Customer_rating')['Reached.on.Time_Y.N'].mean(
# Sort the data based on customer rating
grouped_data.sort_values(by='Customer_rating', inplace=True)
# Create a bar plot
plt.bar(grouped_data['Customer_rating'], grouped_data['Reached.on.Time_Y.N'], color
plt.xlabel('Customer_Rating')
plt.ylabel('Proportion of Product Delivered on Time')
plt.title('Proportion of Product Delivered on Time by Customer Rating')
plt.show()

7/14/23, 10:47 PM E_Commerce
Percentage of Customer Ratings with Products Delivered on Time: 40.33%
# Calculate the percentage of customer ratings with products delivered on time
total_ratings = df['Customer_rating'].count()
on_time_ratings = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].count() #
percentage_on_time = (on_time_ratings / total_ratings) * 100
print("Percentage of Customer Ratings with Products Delivered on Time: {:.2f}%".for
# Calculate the percentage of products delivered on time for each customer rating
rating_counts = df['Customer_rating'].value_counts()
on_time_counts = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].value_counts
# Create a DataFrame with customer ratings and corresponding percentages
rating_percentages = (on_time_counts / rating_counts) * 100
rating_table = pd.DataFrame({'Customer_rating': rating_percentages.index[:5],
'Percentage Delivered on Time': rating_percentages.val
# Set the Customer Rating column as the index
rating_table.set_index('Customer_rating', inplace=True)
# Display the table
print(rating_table)

7/14/23, 10:47 PM E_Commerce
Percentage Delivered on Time
Customer_rating
1 41.252796
2 41.200924
3 39.392586
4 40.475103
5 39.336711
Customer_care_calls 2 3 4 5 6
Customer_rating
1 0.630769 0.633968 0.574530 0.564155 0.508021
2 0.733871 0.617978 0.582985 0.575372 0.472826
3 0.616000 0.644817 0.612903 0.596413 0.509346
4 0.666667 0.600000 0.607242 0.591111 0.525822
5 0.617647 0.628099 0.609898 0.595745 0.558140
Customer_care_calls 7
Customer_rating
1 0.547170
2 0.456522
3 0.481481
4 0.485714
5 0.586207
# Group the data by customer rating and customer care calls
grouped_data = df.groupby(['Customer_rating', 'Customer_care_calls'])['Reached.on.T
# Pivot the data to create a table-like format
pivot_table = grouped_data.pivot(index='Customer_rating', columns='Customer_care_ca
# Plot the percentage graph
pivot_table.plot(kind='bar', stacked=True)
plt.xlabel('Customer_rating')
plt.ylabel('Percentage of Products Delivered on Time')
plt.title('Percentage of Products Delivered on Time by Customer Rating and Customer
plt.legend(title='Customer_care_calls')
plt.show()
# Display the table
print(pivot_table)

7/14/23, 10:47 PM E_Commerce
In [ ]:

E_Commerce Data model

Recommended

Recommended

More Related Content

Similar to E_Commerce Data model

Similar to E_Commerce Data model (20)

Recently uploaded

Recently uploaded (20)

E_Commerce Data model