SlideShare a Scribd company logo
1 of 38
Download to read offline
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 1/38
Numeric variables count: 8
Categorical variables count: 4
In [2]: import pandas as pd
# Uploading the actual file
df = pd.read_csv('E_Commerce.csv')
In [3]: # Calculate the count of numeric variables
numeric_count = df.select_dtypes(include='number').shape[1]
# Calculate the count of categorical variables
categorical_count = df.select_dtypes(include='object').shape[1]
print("Numeric variables count:", numeric_count)
print("Categorical variables count:", categorical_count)
In [9]: data_dict = pd.DataFrame(columns=['Variable Name', 'Description', 'Type'])
# Add variable information to the data dictionary
data_dict['Variable Name'] = df.columns
data_dict['Description'] = ['ID Number of Customers',
'The Company's warehouse block (A, B, C, D, E)',
'The shipping mode (Ship, Flight, Road)',
'Number of calls made by the customer for shipment inqu
'Customer rating (1 to 5, with 1 being the lowest)',
'Cost of the product in US Dollars',
'Number of prior purchases',
'Importance of the product (low, medium, high)',
'Gender of the customer (Male, Female)',
'Discount offered on the product',
'Weight of the product in grams',
'Target variable indicating whether the product reached
# Determine the type of each variable
data_dict['Type'] = df.dtypes.values
# Display the data dictionary
data_dict
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 2/38
Variable Name Description Type
0 ID ID Number of Customers int64
1 Warehouse_block The Company's warehouse block (A, B, C, D, E) object
2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) object
3 Customer_care_calls Number of calls made by the customer for shipm... int64
4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) int64
5 Cost_of_the_Product Cost of the product in US Dollars int64
6 Prior_purchases Number of prior purchases int64
7 Product_importance Importance of the product (low, medium, high) object
8 Gender Gender of the customer (Male, Female) object
9 Discount_offered Discount offered on the product int64
10 Weight_in_gms Weight of the product in grams int64
11 Reached.on.Time_Y.N Target variable indicating whether the product... int64
Variable Name Description Type
0 ID ID Number of Customers Numerical
1 Warehouse_block The Company's warehouse block (A, B, C, D, E) Categorical
2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) Categorical
3 Customer_care_calls Number of calls made by the customer for shipm... Numerical
4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) Numerical
5 Cost_of_the_Product Cost of the product in US Dollars Numerical
6 Prior_purchases Number of prior purchases Numerical
7 Product_importance Importance of the product (low, medium, high) Categorical
8 Gender Gender of the customer (Male, Female) Categorical
9 Discount_offered Discount offered on the product Numerical
10 Weight_in_gms Weight of the product in grams Numerical
11 Reached.on.Time_Y.N Target variable indicating whether the product... Numerical
Out[9]:
In [10]: # Determine the type of each variable
data_dict['Type'] = df.dtypes.replace({'int64': 'Numerical', 'object': 'Categorical
# Display the data dictionary
data_dict
Out[10]:
In [11]: # Count of missing/null values
missing_values_count = df.isnull().sum()
# Redundant columns
redundant_columns = []
# Identify redundant columns (columns with constant values)
for column in df.columns:
if df[column].nunique() == 1:
redundant_columns.append(column)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 3/38
Count of missing/null values:
ID 0
Warehouse_block 0
Mode_of_Shipment 0
Customer_care_calls 0
Customer_rating 0
Cost_of_the_Product 0
Prior_purchases 0
Product_importance 0
Gender 0
Discount_offered 0
Weight_in_gms 0
Reached.on.Time_Y.N 0
dtype: int64
Redundant columns:
[]
# Print the count of missing/null values
print("Count of missing/null values:")
print(missing_values_count)
# Print the redundant columns
print("nRedundant columns:")
print(redundant_columns)
In [13]: import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Relationship between variables
sns.pairplot(df)
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 4/38
In [14]: # Check for multicollinearity
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")
plt.title("Correlation Matrix")
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 5/38
In [15]: # Distribution of variables
df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()
In [16]: # Presence of outliers
df.boxplot(figsize=(10, 8))
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 6/38
In [17]: # Statistical significance of variables
# Class imbalance
sns.countplot(x='Reached.on.Time_Y.N', data=df)
plt.show()
In [21]: import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
# Read the dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Transformation of Numerical Features
df['transformed_weight'] = np.sqrt(df['Weight_in_gms'])
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 7/38
In [22]: # Scaling the Data
scaler = StandardScaler()
df['scaled_weight'] = scaler.fit_transform(df[['Weight_in_gms']])
In [ ]: # Feature Selection
selector = SelectKBest(score_func=chi2, k=5) # Select top 5 features
selected_features = selector.fit_transform(df[['Customer_care_calls', 'Customer_rat
In [23]: # Dimensionality Reduction
pca = PCA(n_components=2) # Reduce to 2 principal components
reduced_features = pca.fit_transform(df[['Customer_care_calls', 'Customer_rating',
In [24]: # Print the updated DataFrame with transformed, scaled, selected, and reduced featu
print(df)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 8/38
ID Warehouse_block Mode_of_Shipment Customer_care_calls 
0 1 D Flight 4
1 2 F Flight 4
2 3 A Flight 2
3 4 B Flight 3
4 5 C Flight 2
... ... ... ... ...
10994 10995 A Ship 4
10995 10996 B Ship 4
10996 10997 C Ship 5
10997 10998 F Ship 5
10998 10999 D Ship 2
Customer_rating Cost_of_the_Product Prior_purchases 
0 2 177 3
1 5 216 2
2 2 183 4
3 3 176 4
4 2 184 3
... ... ... ...
10994 1 252 5
10995 1 232 5
10996 4 242 5
10997 2 223 6
10998 5 155 5
Product_importance Gender Discount_offered Weight_in_gms 
0 low F 44 1233
1 low M 59 3088
2 low M 48 3374
3 medium M 10 1177
4 medium F 46 2484
... ... ... ... ...
10994 medium F 1 1538
10995 medium F 6 1247
10996 low F 4 1155
10997 medium M 2 1210
10998 low F 6 1639
Reached.on.Time_Y.N transformed_weight scaled_weight
0 1 35.114100 -1.468240
1 1 55.569776 -0.333893
2 1 58.086143 -0.159002
3 1 34.307434 -1.502484
4 1 49.839743 -0.703244
... ... ... ...
10994 1 39.217343 -1.281730
10995 0 35.312887 -1.459679
10996 0 33.985291 -1.515937
10997 0 34.785054 -1.482304
10998 0 40.484565 -1.219968
[10999 rows x 14 columns]
In [25]: from sklearn.linear_model import LinearRegression
# Create the SLR model
model_slr = LinearRegression()
# Prepare the features and target variables
X_slr = df[['Cost_of_the_Product']]
y_slr = df['Weight_in_gms']
# Fit the model
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 9/38
R-squared: 0.01758383342649017
Mean Squared Error: 2627192.6380324196
model_slr.fit(X_slr, y_slr)
# Predict the target variable
y_pred_slr = model_slr.predict(X_slr)
In [26]: from sklearn.linear_model import LinearRegression
# Create the Multiple Linear Regression model
model_multiple = LinearRegression()
# Prepare the features and target variables
X_multiple = df[['Cost_of_the_Product', 'Prior_purchases']]
y_multiple = df['Weight_in_gms']
# Fit the model
model_multiple.fit(X_multiple, y_multiple)
# Predict the target variable
y_pred_multiple = model_multiple.predict(X_multiple)
In [27]: from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# Create the SLR model
model_slr = LinearRegression()
# Prepare the features and target variables
X_slr = df[['Cost_of_the_Product']]
y_slr = df['Weight_in_gms']
# Fit the model
model_slr.fit(X_slr, y_slr)
# Predict the target variable
y_pred_slr = model_slr.predict(X_slr)
# Calculate R-squared
r2 = r2_score(y_slr, y_pred_slr)
# Calculate MSE
mse = mean_squared_error(y_slr, y_pred_slr)
# Print the R-squared and MSE values
print("R-squared:", r2)
print("Mean Squared Error:", mse)
In [29]: import matplotlib.pyplot as plt
# Extract the column values for plotting
x_values = df['Cost_of_the_Product']
y_actual = df['Weight_in_gms']
# Plot the actual data points
plt.scatter(x_values, y_actual, color='blue', label='Actual')
# Plot the regression line
plt.plot(x_values, y_pred_slr, color='red', label='Regression Line')
# Set plot labels and title
plt.xlabel('Cost of the Product')
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 10/38
plt.ylabel('Weight in grams')
plt.title('Simple Linear Regression')
# Display legend
plt.legend()
# Show the plot
plt.show()
In [42]: import matplotlib.pyplot as plt
# Plot the actual values
plt.scatter(X_multiple['Cost_of_the_Product'], y_multiple, color='blue', label='Act
# Plot the predicted values
plt.scatter(X_multiple['Cost_of_the_Product'], y_pred_multiple, color='red', label=
# Add the Prior_purchases column to the plot
plt.scatter(X_multiple['Cost_of_the_Product'], X_multiple['Prior_purchases'], color
# Set plot labels and title
plt.xlabel('Cost_of_the_Product')
plt.ylabel('Weight_in_gms')
plt.title('Actual vs Predicted Values')
# Add a legend
plt.legend()
# Display the plot
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 11/38
In [46]: from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Create the Logistic Regression model
model_logistic = LogisticRegression()
# Prepare the features and target variables
X_logistic = df[['Cost_of_the_Product', 'Prior_purchases']]
y_logistic = df['Reached.on.Time_Y.N']
# Fit the model
model_logistic.fit(X_logistic, y_logistic)
# Predict the target variable
y_pred_logistic = model_logistic.predict(X_logistic)
# Create a scatter plot of the predicted values
plt.scatter(range(len(y_pred_logistic)), y_pred_logistic, color='blue', label='Pred
# Plot the actual values
plt.scatter(range(len(y_logistic)), y_logistic, color='red', label='Actual')
# Set plot labels and title
plt.xlabel('Data Point')
plt.ylabel('Reached on Time')
plt.title('Logistic Regression Predictions')
# Add a legend
plt.legend()
# Show the plot
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 12/38
Accuracy: 0.5545454545454546
In [52]: from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Prepare the features and target variables
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the Decision Tree classifier
model_decision_tree = DecisionTreeClassifier()
# Fit the model on the training data
model_decision_tree.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_decision_tree.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [53]: from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Prepare the features and target variables
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the Random Forest classifier
model_random_forest = RandomForestClassifier()
# Fit the model on the training data
model_random_forest.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_random_forest.predict(X_test)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 13/38
Accuracy: 0.5622727272727273
Accuracy: 0.5931818181818181
Accuracy: 0.5636363636363636
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [54]: from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Prepare the features and target variables
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the SVM classifier
model_svm = SVC()
# Fit the model on the training data
model_svm.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_svm.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [55]: from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Prepare the features and target variables
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the base classifier
base_classifier = DecisionTreeClassifier()
# Create the bagging classifier
model_bagging = BaggingClassifier(base_estimator=base_classifier, n_estimators=10,
# Fit the model on the training data
model_bagging.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_bagging.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [56]: from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 14/38
Accuracy: 0.5645454545454546
from sklearn.metrics import accuracy_score
# Prepare the features and target variables
X = df[['Cost_of_the_Product', 'Prior_purchases']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create the base classifier
base_classifier = DecisionTreeClassifier()
# Create the AdaBoost classifier
model_adaboost = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=10
# Fit the model on the training data
model_adaboost.fit(X_train, y_train)
# Predict the target variable for the test data
y_pred = model_adaboost.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [57]: from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Prepare the features
X = df[['Cost_of_the_Product', 'Prior_purchases']]
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot the data points in the reduced dimension space
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA')
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 15/38
In [58]: from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Prepare the features
X = df[['Cost_of_the_Product', 'Prior_purchases']]
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Instantiate the K-Means clustering algorithm
kmeans = KMeans(n_clusters=3, random_state=42)
# Fit the model on the scaled data
kmeans.fit(X_scaled)
# Predict the cluster labels
labels = kmeans.predict(X_scaled)
# Plot the data points with color-coded clusters
plt.scatter(X['Cost_of_the_Product'], X['Prior_purchases'], c=labels)
plt.xlabel('Cost of the Product')
plt.ylabel('Prior Purchases')
plt.title('K-Means Clustering')
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 16/38
Accuracy: 0.5895454545454546
In [3]: import pandas as pd
from sklearn.model_selection import train_test_split
# Read the dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
In [4]: import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [5]: import numpy as np
# Fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the training set
y_train_pred = model.predict(X_train)
# Predict on the test set
y_test_pred = model.predict(X_test)
# Calculate the training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
# Calculate the bias error
bias_error = np.mean(np.abs(y_train_pred - y_train))
# Calculate the variance error
variance_error = np.mean(np.abs(y_test_pred - y_test))
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Bias Error:", bias_error)
print("Variance Error:", variance_error)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 17/38
Training Accuracy: 0.599613592453688
Test Accuracy: 0.5895454545454546
Bias Error: 0.4003864075463121
Variance Error: 0.41045454545454546
Cross-Validation Scores: [0.59727273 0.59681818 0.59681818 0.55954545 0.55025011]
Mean CV Accuracy: 0.5801409318285171
Ensemble Model Accuracy: 0.5281818181818182
Best Parameters: {'C': 0.1}
Best Model Accuracy: 0.5895454545454546
In [6]: from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define the logistic regression model
model = LogisticRegression()
# Perform cross-validation
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cross_val_scores)
print("Mean CV Accuracy:", cross_val_scores.mean())
In [7]: # Use ensemble models (Random Forest)
ensemble_model = RandomForestClassifier(n_estimators=100, random_state=42)
ensemble_model.fit(X_train, y_train)
ensemble_accuracy = ensemble_model.score(X_test, y_test)
print("Ensemble Model Accuracy:", ensemble_accuracy)
In [8]: # Perform hyperparameter tuning using GridSearchCV
parameters = {'C': [0.1, 1, 10]}
grid_search = GridSearchCV(model, parameters, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
In [9]: # Use the best parameters to fit the model
best_model = LogisticRegression(C=best_params['C'])
best_model.fit(X_train, y_train)
best_model_accuracy = best_model.score(X_test, y_test)
print("Best Model Accuracy:", best_model_accuracy)
In [11]: import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
# Read the dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for regression
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the dependent variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 18/38
RMSE: 0.49030942756132945
R-squared: 0.0037867760725102118
Classification Report:
precision recall f1-score support
0 0.43 0.03 0.05 895
1 0.59 0.98 0.74 1305
accuracy 0.59 2200
macro avg 0.51 0.50 0.39 2200
weighted avg 0.53 0.59 0.46 2200
AUC-ROC Score: 0.5386027098182752
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Calculate R-squared value
r2 = r2_score(y_test, y_pred)
# Print the evaluation metrics
print("RMSE:", rmse)
print("R-squared:", r2)
In [13]: import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
# Read the dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for classification
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of positive class
# Generate classification report
classification_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_report)
# Calculate AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_roc)
In [15]: import pandas as pd
from sklearn.cluster import KMeans
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 19/38
Inertia Value: 7956.899346042599
from sklearn.preprocessing import StandardScaler
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for clustering
# For example, let's consider 'Customer care calls' and 'Cost of the product' as th
X = df[['Customer_care_calls', 'Cost_of_the_Product']]
# Perform feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Create and fit the K-means clustering model
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
# Calculate the inertia value
inertia = kmeans.inertia_
# Print the inertia value
print("Inertia Value:", inertia)
In [16]: import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Define the models to compare
models = [
('Logistic Regression', LogisticRegression()),
('Decision Tree', DecisionTreeClassifier()),
('Random Forest', RandomForestClassifier())
]
# Iterate over the models
for model_name, model in models:
# Fit the model on the training data
model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = model.predict(X_test)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 20/38
Model: Logistic Regression
Accuracy: 0.5895454545454546
Precision: 0.5936626281453867
Recall: 0.9762452107279693
F1-score: 0.7383367139959431
-------------------------
Model: Decision Tree
Accuracy: 0.5109090909090909
Precision: 0.5984522785898538
Recall: 0.5333333333333333
F1-score: 0.5640194489465155
-------------------------
Model: Random Forest
Accuracy: 0.5322727272727272
Precision: 0.5997109826589595
Recall: 0.6360153256704981
F1-score: 0.6173298624023801
-------------------------
# Print the results
print(f"Model: {model_name}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("-------------------------")
In [17]: import time
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create and fit the logistic regression model
model = LogisticRegression()
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()
# Calculate the time taken for model training
training_time = end_time - start_time
print("Training Time:", training_time)
# Make predictions on the test set
start_time = time.time()
y_pred = model.predict(X_test)
end_time = time.time()
# Calculate the time taken for making predictions
prediction_time = end_time - start_time
print("Prediction Time:", prediction_time)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 21/38
Training Time: 0.04000043869018555
Prediction Time: 0.002998828887939453
In [18]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for EDA
# For example, let's consider 'Customer care calls', 'Customer rating', 'Cost of th
selected_columns = ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product
df_selected = df[selected_columns]
# Correlation Matrix
correlation_matrix = df_selected.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
In [20]: # Pair Plots
sns.pairplot(df_selected, hue='Reached.on.Time_Y.N', diag_kind='kde')
plt.title('Pair Plots')
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 22/38
In [21]: # Box Plots
plt.figure(figsize=(10, 6))
for i, column in enumerate(selected_columns[:-1]):
plt.subplot(2, 2, i+1)
sns.boxplot(x='Reached.on.Time_Y.N', y=column, data=df_selected)
plt.xlabel('Reached.on.Time_Y.N')
plt.ylabel(column)
plt.tight_layout()
plt.suptitle('Box Plots', y=1.05)
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 23/38
In [22]: # Distribution Plots
plt.figure(figsize=(10, 6))
for i, column in enumerate(selected_columns[:-1]):
plt.subplot(2, 2, i+1)
sns.histplot(data=df_selected, x=column, hue='Reached.on.Time_Y.N', kde=True)
plt.xlabel(column)
plt.ylabel('Count')
plt.tight_layout()
plt.suptitle('Distribution Plots', y=1.05)
plt.show()
In [23]: import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 24/38
Accuracy: 0.5895454545454546
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
In [5]: import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
# Model Parameters
model_params = {
'solver': 'lbfgs',
'C': 1.0,
'max_iter': 100,
# Add other model parameters as needed
}
# Create a bar chart for model parameters
fig = px.bar(
x=list(model_params.keys()),
y=list(model_params.values()),
labels={'x': 'Parameter', 'y': 'Value'},
title='Model Parameters'
)
fig.show()
# Create a table for model parameters
table_data = [['Parameter', 'Value']]
table_data.extend(list(model_params.items()))
fig = go.Figure(data=[go.Table(header=dict(values=table_data[0]), cells=dict(values
fig.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 25/38
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 26/38
In [25]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Define a range of parameter values to test
parameter_values = [0.001, 0.01, 0.1, 1, 10, 100]
# Initialize lists to store parameter values and corresponding accuracy scores
parameters = []
accuracy_scores = []
# Iterate over the parameter values
for param in parameter_values:
# Create and fit the logistic regression model with the current parameter value
model = LogisticRegression(C=param)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 27/38
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate accuracy and append to the lists
accuracy = accuracy_score(y_test, y_pred)
parameters.append(param)
accuracy_scores.append(accuracy)
# Plot the performance with varying parameters
plt.plot(parameters, accuracy_scores, marker='o')
plt.xlabel('Parameter Value')
plt.ylabel('Accuracy')
plt.title('Model Performance with Varying Parameters')
plt.xticks(np.arange(min(parameters), max(parameters) + 1, 1.0))
plt.show()
In [27]: import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Generate classification report
classification_metrics = classification_report(y_test, y_pred, output_dict=True)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 28/38
Metrics not available for class label: accuracy
# Extract metrics and class labels
metrics = ['precision', 'recall', 'f1-score', 'support']
class_labels = list(classification_metrics.keys())[:-1] # Exclude 'macro avg' and
# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame(index=class_labels, columns=metrics)
for label in class_labels:
if isinstance(classification_metrics[label], dict):
metrics_df.loc[label] = [classification_metrics[label][metric] for metric i
else:
print(f"Metrics not available for class label: {label}")
# Plot the metrics
sns.set_style("whitegrid")
metrics_df.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Class Labels')
plt.ylabel('Metric Score')
plt.title('Model Metrics - Classification Report')
plt.legend(title='Metrics', bbox_to_anchor=(1, 1))
plt.show()
In [28]: import pandas as pd
from sklearn.linear_model import LogisticRegression
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X, y)
# Predict the target variable for the entire dataset
y_pred = model.predict(X)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 29/38
Reached.on.Time_Y.N Predicted Reached on time
0 1 1
1 1 1
2 1 1
3 1 1
4 1 1
... ... ...
10994 1 1
10995 0 1
10996 0 1
10997 0 1
10998 0 1
[10999 rows x 2 columns]
Percentage of correct predictions: 59.6872442949359
# Add the predicted values to the DataFrame
df['Predicted Reached on time'] = y_pred
# Print the testing outcome of the whole E-commerce dataset
print(df[['Reached.on.Time_Y.N', 'Predicted Reached on time']])
In [29]: import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for modeling
# For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o
# and 'Reached on time' as the target variable
X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']]
y = df['Reached.on.Time_Y.N']
# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X, y)
# Predict the target variable for the entire dataset
y_pred = model.predict(X)
# Calculate the accuracy
accuracy = accuracy_score(y, y_pred)
percentage_correct = accuracy * 100
# Print the percentage of correct predictions
print("Percentage of correct predictions:", percentage_correct)
In [3]: import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Read the E-commerce dat# Filter successful and obvious cases
success_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 0] # Assuming 0
obvious_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1
# Create visualizations for the successful cases
# Example: Histogram of Customer Ratings
plt.figure(figsize=(8, 6))
plt.hist(success_cases['Customer_rating'], bins=10, alpha=0.5, color='green')
plt.xlabel('Customer_rating')
plt.ylabel('Count')
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 30/38
plt.title('Distribution of Customer_ratings for Successful Deliveries')
plt.show()
# Create visualizations for the obvious cases
# Example: Scatter Plot of Cost of the Product vs. Customer Rating
fig = go.Figure(data=go.Scatter(
x=obvious_cases['Cost_of_the_Product'],
y=obvious_cases['Customer_rating'],
mode='markers',
marker=dict(color='red')
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating for Obvious Failures',
xaxis_title='Cost of the Product',
yaxis_title='Customer Rating',
)
fig.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 31/38
In [2]: import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.subplots as sp
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for analysis
# For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache
df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N']
# Filter failure cases
failure_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1
# Create visualizations for the failure cases
# Example 1: Box Plot of Customer Rating
plt.figure(figsize=(8, 6))
plt.boxplot(failure_cases['Customer_rating'])
plt.xlabel('Failure Cases')
plt.ylabel('Customer_rating')
plt.title('Distribution of Customer Ratings for Failure Cases')
plt.show()
# Example 2: Scatter Plot of Cost of the Product vs. Customer Rating
fig = go.Figure(data=go.Scatter(
x=failure_cases['Cost_of_the_Product'],
y=failure_cases['Customer_rating'],
mode='markers',
marker=dict(color='red')
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 32/38
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating for Failure Cases',
xaxis_title='Cost of the Product',
yaxis_title='Customer Rating',
)
fig.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 33/38
In [4]: import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for analysis
# For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache
df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N']
# Filter border cases
border_cases = df_selected[
(df_selected['Reached.on.Time_Y.N'] == 0) | (df_selected['Reached.on.Time_Y.N'
] # Assuming 0 indicates successful delivery, and 1 indicates failure
# Create visualizations for the border cases
# Example 1: Scatter Plot of Cost of the Product vs. Customer Rating
fig = go.Figure(data=go.Scatter(
x=border_cases['Cost_of_the_Product'],
y=border_cases['Customer_rating'],
mode='markers',
marker=dict(
color=border_cases['Reached.on.Time_Y.N'],
colorscale='Viridis',
showscale=True
)
))
fig.update_layout(
title='Cost of the Product vs. Customer Rating (Border Cases)',
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 34/38
xaxis_title='Cost of the Product',
yaxis_title='Customer Rating',
)
fig.show()
# Example 2: Pie Chart of Reached on Time vs. Not Reached on Time
counts = border_cases['Reached.on.Time_Y.N'].value_counts()
fig = go.Figure(data=go.Pie(labels=counts.index, values=counts.values))
fig.update_layout(
title='Distribution of Reached on Time vs. Not Reached on Time (Border Cases)',
)
fig.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 35/38
In [7]: import pandas as pd
import matplotlib.pyplot as plt
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Select the relevant columns for analysis
# For example, let's consider 'Customer rating' and 'Reached on time'
df_selected = df[['Customer_rating', 'Reached.on.Time_Y.N']]
# Calculate the proportion of on-time deliveries for each customer rating
grouped_data = df_selected.groupby('Customer_rating')['Reached.on.Time_Y.N'].mean(
# Sort the data based on customer rating
grouped_data.sort_values(by='Customer_rating', inplace=True)
# Create a bar plot
plt.figure(figsize=(8, 6))
plt.bar(grouped_data['Customer_rating'], grouped_data['Reached.on.Time_Y.N'], color
plt.xlabel('Customer_Rating')
plt.ylabel('Proportion of Product Delivered on Time')
plt.title('Proportion of Product Delivered on Time by Customer Rating')
plt.show()
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 36/38
Percentage of Customer Ratings with Products Delivered on Time: 40.33%
In [8]: import pandas as pd
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Calculate the percentage of customer ratings with products delivered on time
total_ratings = df['Customer_rating'].count()
on_time_ratings = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].count() #
percentage_on_time = (on_time_ratings / total_ratings) * 100
print("Percentage of Customer Ratings with Products Delivered on Time: {:.2f}%".for
In [9]: import pandas as pd
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Calculate the percentage of products delivered on time for each customer rating
rating_counts = df['Customer_rating'].value_counts()
on_time_counts = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].value_counts
# Create a DataFrame with customer ratings and corresponding percentages
rating_percentages = (on_time_counts / rating_counts) * 100
rating_table = pd.DataFrame({'Customer_rating': rating_percentages.index[:5],
'Percentage Delivered on Time': rating_percentages.val
# Set the Customer Rating column as the index
rating_table.set_index('Customer_rating', inplace=True)
# Display the table
print(rating_table)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 37/38
Percentage Delivered on Time
Customer_rating
1 41.252796
2 41.200924
3 39.392586
4 40.475103
5 39.336711
Customer_care_calls 2 3 4 5 6 
Customer_rating
1 0.630769 0.633968 0.574530 0.564155 0.508021
2 0.733871 0.617978 0.582985 0.575372 0.472826
3 0.616000 0.644817 0.612903 0.596413 0.509346
4 0.666667 0.600000 0.607242 0.591111 0.525822
5 0.617647 0.628099 0.609898 0.595745 0.558140
Customer_care_calls 7
Customer_rating
1 0.547170
2 0.456522
3 0.481481
4 0.485714
5 0.586207
In [13]: import pandas as pd
import matplotlib.pyplot as plt
# Read the E-commerce dataset into a pandas DataFrame
df = pd.read_csv('E_Commerce.csv')
# Group the data by customer rating and customer care calls
grouped_data = df.groupby(['Customer_rating', 'Customer_care_calls'])['Reached.on.T
# Pivot the data to create a table-like format
pivot_table = grouped_data.pivot(index='Customer_rating', columns='Customer_care_ca
# Plot the percentage graph
pivot_table.plot(kind='bar', stacked=True)
plt.xlabel('Customer_rating')
plt.ylabel('Percentage of Products Delivered on Time')
plt.title('Percentage of Products Delivered on Time by Customer Rating and Customer
plt.legend(title='Customer_care_calls')
plt.show()
# Display the table
print(pivot_table)
7/14/23, 10:47 PM E_Commerce
file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 38/38
In [ ]:

More Related Content

Similar to E_Commerce

Getting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdfGetting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdfSudhakarVenkey
 
Customer Clustering for Retailer Marketing
Customer Clustering for Retailer MarketingCustomer Clustering for Retailer Marketing
Customer Clustering for Retailer MarketingJonathan Sedar
 
Customer Clustering For Retail Marketing
Customer Clustering For Retail MarketingCustomer Clustering For Retail Marketing
Customer Clustering For Retail MarketingJonathan Sedar
 
Informatics Practices/ Information Practices Project (IP Project Class 12)
Informatics Practices/ Information Practices Project (IP Project Class 12)Informatics Practices/ Information Practices Project (IP Project Class 12)
Informatics Practices/ Information Practices Project (IP Project Class 12)KushShah65
 
Simplify Feature Engineering in Your Data Warehouse
Simplify Feature Engineering in Your Data WarehouseSimplify Feature Engineering in Your Data Warehouse
Simplify Feature Engineering in Your Data WarehouseFeatureByte
 
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADataconomy Media
 
[open source] hamilton, a micro framework for creating dataframes, and its ap...
[open source] hamilton, a micro framework for creating dataframes, and its ap...[open source] hamilton, a micro framework for creating dataframes, and its ap...
[open source] hamilton, a micro framework for creating dataframes, and its ap...Stefan Krawczyk
 
Optimization in django orm
Optimization in django ormOptimization in django orm
Optimization in django ormDenys Levchenko
 
Python Programming.pptx
Python Programming.pptxPython Programming.pptx
Python Programming.pptxSudhakarVenkey
 
ADBMS ASSIGNMENT
ADBMS ASSIGNMENTADBMS ASSIGNMENT
ADBMS ASSIGNMENTLori Moore
 
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its author
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its authorKaggle Winning Solution Xgboost algorithm -- Let us learn from its author
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its authorVivian S. Zhang
 
Worksheet - python Pandas numerical py pdf
Worksheet - python Pandas numerical py pdfWorksheet - python Pandas numerical py pdf
Worksheet - python Pandas numerical py pdfudaywalnandini
 
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdfNeed an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdfactexerode
 
Database Development Replication Security Maintenance Report
Database Development Replication Security Maintenance ReportDatabase Development Replication Security Maintenance Report
Database Development Replication Security Maintenance Reportnyin27
 
Lecture 1 Pandas Basics.pptx machine learning
Lecture 1 Pandas Basics.pptx machine learningLecture 1 Pandas Basics.pptx machine learning
Lecture 1 Pandas Basics.pptx machine learningmy6305874
 

Similar to E_Commerce (20)

Getting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdfGetting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdf
 
12Structures.pptx
12Structures.pptx12Structures.pptx
12Structures.pptx
 
Customer Clustering for Retailer Marketing
Customer Clustering for Retailer MarketingCustomer Clustering for Retailer Marketing
Customer Clustering for Retailer Marketing
 
Customer Clustering For Retail Marketing
Customer Clustering For Retail MarketingCustomer Clustering For Retail Marketing
Customer Clustering For Retail Marketing
 
Informatics Practices/ Information Practices Project (IP Project Class 12)
Informatics Practices/ Information Practices Project (IP Project Class 12)Informatics Practices/ Information Practices Project (IP Project Class 12)
Informatics Practices/ Information Practices Project (IP Project Class 12)
 
Simplify Feature Engineering in Your Data Warehouse
Simplify Feature Engineering in Your Data WarehouseSimplify Feature Engineering in Your Data Warehouse
Simplify Feature Engineering in Your Data Warehouse
 
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
 
[open source] hamilton, a micro framework for creating dataframes, and its ap...
[open source] hamilton, a micro framework for creating dataframes, and its ap...[open source] hamilton, a micro framework for creating dataframes, and its ap...
[open source] hamilton, a micro framework for creating dataframes, and its ap...
 
Customer analytics for e commerce
Customer analytics for e commerceCustomer analytics for e commerce
Customer analytics for e commerce
 
Optimization in django orm
Optimization in django ormOptimization in django orm
Optimization in django orm
 
Python Programming.pptx
Python Programming.pptxPython Programming.pptx
Python Programming.pptx
 
Final project kijtorntham n
Final project kijtorntham nFinal project kijtorntham n
Final project kijtorntham n
 
ADBMS ASSIGNMENT
ADBMS ASSIGNMENTADBMS ASSIGNMENT
ADBMS ASSIGNMENT
 
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its author
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its authorKaggle Winning Solution Xgboost algorithm -- Let us learn from its author
Kaggle Winning Solution Xgboost algorithm -- Let us learn from its author
 
interenship.pptx
interenship.pptxinterenship.pptx
interenship.pptx
 
More on Pandas.pptx
More on Pandas.pptxMore on Pandas.pptx
More on Pandas.pptx
 
Worksheet - python Pandas numerical py pdf
Worksheet - python Pandas numerical py pdfWorksheet - python Pandas numerical py pdf
Worksheet - python Pandas numerical py pdf
 
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdfNeed an detailed analysis of what this code-model is doing- Thanks #St.pdf
Need an detailed analysis of what this code-model is doing- Thanks #St.pdf
 
Database Development Replication Security Maintenance Report
Database Development Replication Security Maintenance ReportDatabase Development Replication Security Maintenance Report
Database Development Replication Security Maintenance Report
 
Lecture 1 Pandas Basics.pptx machine learning
Lecture 1 Pandas Basics.pptx machine learningLecture 1 Pandas Basics.pptx machine learning
Lecture 1 Pandas Basics.pptx machine learning
 

Recently uploaded

Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...gajnagarg
 
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...HyderabadDolls
 
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...nirzagarg
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...nirzagarg
 
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...nirzagarg
 
Predictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesPredictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesBoston Institute of Analytics
 
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxRESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxronsairoathenadugay
 
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...gajnagarg
 
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...HyderabadDolls
 
Vastral Call Girls Book Now 7737669865 Top Class Escort Service Available
Vastral Call Girls Book Now 7737669865 Top Class Escort Service AvailableVastral Call Girls Book Now 7737669865 Top Class Escort Service Available
Vastral Call Girls Book Now 7737669865 Top Class Escort Service Availablegargpaaro
 
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridih
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime GiridihGiridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridih
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridihmeghakumariji156
 
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...ThinkInnovation
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...nirzagarg
 
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...kumargunjan9515
 
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...HyderabadDolls
 
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...gajnagarg
 
Ranking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRanking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRajesh Mondal
 
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...gajnagarg
 

Recently uploaded (20)

Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In bhavnagar [ 7014168258 ] Call Me For Genuine Models...
 
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
 
Call Girls in G.T.B. Nagar (delhi) call me [🔝9953056974🔝] escort service 24X7
Call Girls in G.T.B. Nagar  (delhi) call me [🔝9953056974🔝] escort service 24X7Call Girls in G.T.B. Nagar  (delhi) call me [🔝9953056974🔝] escort service 24X7
Call Girls in G.T.B. Nagar (delhi) call me [🔝9953056974🔝] escort service 24X7
 
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
 
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Begusarai [ 7014168258 ] Call Me For Genuine Models...
 
Predictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesPredictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting Techniques
 
Abortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
Abortion pills in Doha {{ QATAR }} +966572737505) Get CytotecAbortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
Abortion pills in Doha {{ QATAR }} +966572737505) Get Cytotec
 
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxRESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
 
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
Top profile Call Girls In Chandrapur [ 7014168258 ] Call Me For Genuine Model...
 
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
 
Vastral Call Girls Book Now 7737669865 Top Class Escort Service Available
Vastral Call Girls Book Now 7737669865 Top Class Escort Service AvailableVastral Call Girls Book Now 7737669865 Top Class Escort Service Available
Vastral Call Girls Book Now 7737669865 Top Class Escort Service Available
 
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridih
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime GiridihGiridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridih
Giridih Escorts Service Girl ^ 9332606886, WhatsApp Anytime Giridih
 
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
 
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
 
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...
Gomti Nagar & best call girls in Lucknow | 9548273370 Independent Escorts & D...
 
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...
Top profile Call Girls In Nandurbar [ 7014168258 ] Call Me For Genuine Models...
 
Ranking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRanking and Scoring Exercises for Research
Ranking and Scoring Exercises for Research
 
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
 

E_Commerce

  • 1. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 1/38 Numeric variables count: 8 Categorical variables count: 4 In [2]: import pandas as pd # Uploading the actual file df = pd.read_csv('E_Commerce.csv') In [3]: # Calculate the count of numeric variables numeric_count = df.select_dtypes(include='number').shape[1] # Calculate the count of categorical variables categorical_count = df.select_dtypes(include='object').shape[1] print("Numeric variables count:", numeric_count) print("Categorical variables count:", categorical_count) In [9]: data_dict = pd.DataFrame(columns=['Variable Name', 'Description', 'Type']) # Add variable information to the data dictionary data_dict['Variable Name'] = df.columns data_dict['Description'] = ['ID Number of Customers', 'The Company's warehouse block (A, B, C, D, E)', 'The shipping mode (Ship, Flight, Road)', 'Number of calls made by the customer for shipment inqu 'Customer rating (1 to 5, with 1 being the lowest)', 'Cost of the product in US Dollars', 'Number of prior purchases', 'Importance of the product (low, medium, high)', 'Gender of the customer (Male, Female)', 'Discount offered on the product', 'Weight of the product in grams', 'Target variable indicating whether the product reached # Determine the type of each variable data_dict['Type'] = df.dtypes.values # Display the data dictionary data_dict
  • 2. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 2/38 Variable Name Description Type 0 ID ID Number of Customers int64 1 Warehouse_block The Company's warehouse block (A, B, C, D, E) object 2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) object 3 Customer_care_calls Number of calls made by the customer for shipm... int64 4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) int64 5 Cost_of_the_Product Cost of the product in US Dollars int64 6 Prior_purchases Number of prior purchases int64 7 Product_importance Importance of the product (low, medium, high) object 8 Gender Gender of the customer (Male, Female) object 9 Discount_offered Discount offered on the product int64 10 Weight_in_gms Weight of the product in grams int64 11 Reached.on.Time_Y.N Target variable indicating whether the product... int64 Variable Name Description Type 0 ID ID Number of Customers Numerical 1 Warehouse_block The Company's warehouse block (A, B, C, D, E) Categorical 2 Mode_of_Shipment The shipping mode (Ship, Flight, Road) Categorical 3 Customer_care_calls Number of calls made by the customer for shipm... Numerical 4 Customer_rating Customer rating (1 to 5, with 1 being the lowest) Numerical 5 Cost_of_the_Product Cost of the product in US Dollars Numerical 6 Prior_purchases Number of prior purchases Numerical 7 Product_importance Importance of the product (low, medium, high) Categorical 8 Gender Gender of the customer (Male, Female) Categorical 9 Discount_offered Discount offered on the product Numerical 10 Weight_in_gms Weight of the product in grams Numerical 11 Reached.on.Time_Y.N Target variable indicating whether the product... Numerical Out[9]: In [10]: # Determine the type of each variable data_dict['Type'] = df.dtypes.replace({'int64': 'Numerical', 'object': 'Categorical # Display the data dictionary data_dict Out[10]: In [11]: # Count of missing/null values missing_values_count = df.isnull().sum() # Redundant columns redundant_columns = [] # Identify redundant columns (columns with constant values) for column in df.columns: if df[column].nunique() == 1: redundant_columns.append(column)
  • 3. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 3/38 Count of missing/null values: ID 0 Warehouse_block 0 Mode_of_Shipment 0 Customer_care_calls 0 Customer_rating 0 Cost_of_the_Product 0 Prior_purchases 0 Product_importance 0 Gender 0 Discount_offered 0 Weight_in_gms 0 Reached.on.Time_Y.N 0 dtype: int64 Redundant columns: [] # Print the count of missing/null values print("Count of missing/null values:") print(missing_values_count) # Print the redundant columns print("nRedundant columns:") print(redundant_columns) In [13]: import pandas as pd import matplotlib.pyplot as plt import seaborn as sns # Read the dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Relationship between variables sns.pairplot(df) plt.show()
  • 4. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 4/38 In [14]: # Check for multicollinearity correlation_matrix = df.corr() sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu") plt.title("Correlation Matrix") plt.show()
  • 5. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 5/38 In [15]: # Distribution of variables df.hist(figsize=(10, 8)) plt.tight_layout() plt.show() In [16]: # Presence of outliers df.boxplot(figsize=(10, 8)) plt.show()
  • 6. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 6/38 In [17]: # Statistical significance of variables # Class imbalance sns.countplot(x='Reached.on.Time_Y.N', data=df) plt.show() In [21]: import pandas as pd import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.feature_selection import SelectKBest, chi2 from sklearn.decomposition import PCA # Read the dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Transformation of Numerical Features df['transformed_weight'] = np.sqrt(df['Weight_in_gms'])
  • 7. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 7/38 In [22]: # Scaling the Data scaler = StandardScaler() df['scaled_weight'] = scaler.fit_transform(df[['Weight_in_gms']]) In [ ]: # Feature Selection selector = SelectKBest(score_func=chi2, k=5) # Select top 5 features selected_features = selector.fit_transform(df[['Customer_care_calls', 'Customer_rat In [23]: # Dimensionality Reduction pca = PCA(n_components=2) # Reduce to 2 principal components reduced_features = pca.fit_transform(df[['Customer_care_calls', 'Customer_rating', In [24]: # Print the updated DataFrame with transformed, scaled, selected, and reduced featu print(df)
  • 8. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 8/38 ID Warehouse_block Mode_of_Shipment Customer_care_calls 0 1 D Flight 4 1 2 F Flight 4 2 3 A Flight 2 3 4 B Flight 3 4 5 C Flight 2 ... ... ... ... ... 10994 10995 A Ship 4 10995 10996 B Ship 4 10996 10997 C Ship 5 10997 10998 F Ship 5 10998 10999 D Ship 2 Customer_rating Cost_of_the_Product Prior_purchases 0 2 177 3 1 5 216 2 2 2 183 4 3 3 176 4 4 2 184 3 ... ... ... ... 10994 1 252 5 10995 1 232 5 10996 4 242 5 10997 2 223 6 10998 5 155 5 Product_importance Gender Discount_offered Weight_in_gms 0 low F 44 1233 1 low M 59 3088 2 low M 48 3374 3 medium M 10 1177 4 medium F 46 2484 ... ... ... ... ... 10994 medium F 1 1538 10995 medium F 6 1247 10996 low F 4 1155 10997 medium M 2 1210 10998 low F 6 1639 Reached.on.Time_Y.N transformed_weight scaled_weight 0 1 35.114100 -1.468240 1 1 55.569776 -0.333893 2 1 58.086143 -0.159002 3 1 34.307434 -1.502484 4 1 49.839743 -0.703244 ... ... ... ... 10994 1 39.217343 -1.281730 10995 0 35.312887 -1.459679 10996 0 33.985291 -1.515937 10997 0 34.785054 -1.482304 10998 0 40.484565 -1.219968 [10999 rows x 14 columns] In [25]: from sklearn.linear_model import LinearRegression # Create the SLR model model_slr = LinearRegression() # Prepare the features and target variables X_slr = df[['Cost_of_the_Product']] y_slr = df['Weight_in_gms'] # Fit the model
  • 9. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 9/38 R-squared: 0.01758383342649017 Mean Squared Error: 2627192.6380324196 model_slr.fit(X_slr, y_slr) # Predict the target variable y_pred_slr = model_slr.predict(X_slr) In [26]: from sklearn.linear_model import LinearRegression # Create the Multiple Linear Regression model model_multiple = LinearRegression() # Prepare the features and target variables X_multiple = df[['Cost_of_the_Product', 'Prior_purchases']] y_multiple = df['Weight_in_gms'] # Fit the model model_multiple.fit(X_multiple, y_multiple) # Predict the target variable y_pred_multiple = model_multiple.predict(X_multiple) In [27]: from sklearn.linear_model import LinearRegression from sklearn.metrics import r2_score, mean_squared_error # Create the SLR model model_slr = LinearRegression() # Prepare the features and target variables X_slr = df[['Cost_of_the_Product']] y_slr = df['Weight_in_gms'] # Fit the model model_slr.fit(X_slr, y_slr) # Predict the target variable y_pred_slr = model_slr.predict(X_slr) # Calculate R-squared r2 = r2_score(y_slr, y_pred_slr) # Calculate MSE mse = mean_squared_error(y_slr, y_pred_slr) # Print the R-squared and MSE values print("R-squared:", r2) print("Mean Squared Error:", mse) In [29]: import matplotlib.pyplot as plt # Extract the column values for plotting x_values = df['Cost_of_the_Product'] y_actual = df['Weight_in_gms'] # Plot the actual data points plt.scatter(x_values, y_actual, color='blue', label='Actual') # Plot the regression line plt.plot(x_values, y_pred_slr, color='red', label='Regression Line') # Set plot labels and title plt.xlabel('Cost of the Product')
  • 10. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 10/38 plt.ylabel('Weight in grams') plt.title('Simple Linear Regression') # Display legend plt.legend() # Show the plot plt.show() In [42]: import matplotlib.pyplot as plt # Plot the actual values plt.scatter(X_multiple['Cost_of_the_Product'], y_multiple, color='blue', label='Act # Plot the predicted values plt.scatter(X_multiple['Cost_of_the_Product'], y_pred_multiple, color='red', label= # Add the Prior_purchases column to the plot plt.scatter(X_multiple['Cost_of_the_Product'], X_multiple['Prior_purchases'], color # Set plot labels and title plt.xlabel('Cost_of_the_Product') plt.ylabel('Weight_in_gms') plt.title('Actual vs Predicted Values') # Add a legend plt.legend() # Display the plot plt.show()
  • 11. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 11/38 In [46]: from sklearn.linear_model import LogisticRegression import matplotlib.pyplot as plt # Create the Logistic Regression model model_logistic = LogisticRegression() # Prepare the features and target variables X_logistic = df[['Cost_of_the_Product', 'Prior_purchases']] y_logistic = df['Reached.on.Time_Y.N'] # Fit the model model_logistic.fit(X_logistic, y_logistic) # Predict the target variable y_pred_logistic = model_logistic.predict(X_logistic) # Create a scatter plot of the predicted values plt.scatter(range(len(y_pred_logistic)), y_pred_logistic, color='blue', label='Pred # Plot the actual values plt.scatter(range(len(y_logistic)), y_logistic, color='red', label='Actual') # Set plot labels and title plt.xlabel('Data Point') plt.ylabel('Reached on Time') plt.title('Logistic Regression Predictions') # Add a legend plt.legend() # Show the plot plt.show()
  • 12. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 12/38 Accuracy: 0.5545454545454546 In [52]: from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # Prepare the features and target variables X = df[['Cost_of_the_Product', 'Prior_purchases']] y = df['Reached.on.Time_Y.N'] # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create the Decision Tree classifier model_decision_tree = DecisionTreeClassifier() # Fit the model on the training data model_decision_tree.fit(X_train, y_train) # Predict the target variable for the test data y_pred = model_decision_tree.predict(X_test) # Calculate the accuracy of the model accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [53]: from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # Prepare the features and target variables X = df[['Cost_of_the_Product', 'Prior_purchases']] y = df['Reached.on.Time_Y.N'] # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create the Random Forest classifier model_random_forest = RandomForestClassifier() # Fit the model on the training data model_random_forest.fit(X_train, y_train) # Predict the target variable for the test data y_pred = model_random_forest.predict(X_test)
  • 13. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 13/38 Accuracy: 0.5622727272727273 Accuracy: 0.5931818181818181 Accuracy: 0.5636363636363636 # Calculate the accuracy of the model accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [54]: from sklearn.svm import SVC from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # Prepare the features and target variables X = df[['Cost_of_the_Product', 'Prior_purchases']] y = df['Reached.on.Time_Y.N'] # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create the SVM classifier model_svm = SVC() # Fit the model on the training data model_svm.fit(X_train, y_train) # Predict the target variable for the test data y_pred = model_svm.predict(X_test) # Calculate the accuracy of the model accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [55]: from sklearn.ensemble import BaggingClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # Prepare the features and target variables X = df[['Cost_of_the_Product', 'Prior_purchases']] y = df['Reached.on.Time_Y.N'] # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create the base classifier base_classifier = DecisionTreeClassifier() # Create the bagging classifier model_bagging = BaggingClassifier(base_estimator=base_classifier, n_estimators=10, # Fit the model on the training data model_bagging.fit(X_train, y_train) # Predict the target variable for the test data y_pred = model_bagging.predict(X_test) # Calculate the accuracy of the model accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [56]: from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split
  • 14. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 14/38 Accuracy: 0.5645454545454546 from sklearn.metrics import accuracy_score # Prepare the features and target variables X = df[['Cost_of_the_Product', 'Prior_purchases']] y = df['Reached.on.Time_Y.N'] # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create the base classifier base_classifier = DecisionTreeClassifier() # Create the AdaBoost classifier model_adaboost = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=10 # Fit the model on the training data model_adaboost.fit(X_train, y_train) # Predict the target variable for the test data y_pred = model_adaboost.predict(X_test) # Calculate the accuracy of the model accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [57]: from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt # Prepare the features X = df[['Cost_of_the_Product', 'Prior_purchases']] # Standardize the features scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # Perform PCA pca = PCA(n_components=2) X_pca = pca.fit_transform(X_scaled) # Plot the data points in the reduced dimension space plt.scatter(X_pca[:, 0], X_pca[:, 1]) plt.xlabel('Principal Component 1') plt.ylabel('Principal Component 2') plt.title('PCA') plt.show()
  • 15. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 15/38 In [58]: from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt # Prepare the features X = df[['Cost_of_the_Product', 'Prior_purchases']] # Standardize the features scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # Instantiate the K-Means clustering algorithm kmeans = KMeans(n_clusters=3, random_state=42) # Fit the model on the scaled data kmeans.fit(X_scaled) # Predict the cluster labels labels = kmeans.predict(X_scaled) # Plot the data points with color-coded clusters plt.scatter(X['Cost_of_the_Product'], X['Prior_purchases'], c=labels) plt.xlabel('Cost of the Product') plt.ylabel('Prior Purchases') plt.title('K-Means Clustering') plt.show()
  • 16. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 16/38 Accuracy: 0.5895454545454546 In [3]: import pandas as pd from sklearn.model_selection import train_test_split # Read the dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta In [4]: import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # Create and fit the logistic regression model model = LogisticRegression() model.fit(X_train, y_train) # Predict on the test set y_pred = model.predict(X_test) # Calculate accuracy accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [5]: import numpy as np # Fit the logistic regression model model = LogisticRegression() model.fit(X_train, y_train) # Predict on the training set y_train_pred = model.predict(X_train) # Predict on the test set y_test_pred = model.predict(X_test) # Calculate the training accuracy train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate the test accuracy test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate the bias error bias_error = np.mean(np.abs(y_train_pred - y_train)) # Calculate the variance error variance_error = np.mean(np.abs(y_test_pred - y_test)) print("Training Accuracy:", train_accuracy) print("Test Accuracy:", test_accuracy) print("Bias Error:", bias_error) print("Variance Error:", variance_error)
  • 17. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 17/38 Training Accuracy: 0.599613592453688 Test Accuracy: 0.5895454545454546 Bias Error: 0.4003864075463121 Variance Error: 0.41045454545454546 Cross-Validation Scores: [0.59727273 0.59681818 0.59681818 0.55954545 0.55025011] Mean CV Accuracy: 0.5801409318285171 Ensemble Model Accuracy: 0.5281818181818182 Best Parameters: {'C': 0.1} Best Model Accuracy: 0.5895454545454546 In [6]: from sklearn.model_selection import cross_val_score, GridSearchCV from sklearn.ensemble import RandomForestClassifier # Define the logistic regression model model = LogisticRegression() # Perform cross-validation cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') print("Cross-Validation Scores:", cross_val_scores) print("Mean CV Accuracy:", cross_val_scores.mean()) In [7]: # Use ensemble models (Random Forest) ensemble_model = RandomForestClassifier(n_estimators=100, random_state=42) ensemble_model.fit(X_train, y_train) ensemble_accuracy = ensemble_model.score(X_test, y_test) print("Ensemble Model Accuracy:", ensemble_accuracy) In [8]: # Perform hyperparameter tuning using GridSearchCV parameters = {'C': [0.1, 1, 10]} grid_search = GridSearchCV(model, parameters, cv=5) grid_search.fit(X_train, y_train) best_params = grid_search.best_params_ print("Best Parameters:", best_params) In [9]: # Use the best parameters to fit the model best_model = LogisticRegression(C=best_params['C']) best_model.fit(X_train, y_train) best_model_accuracy = best_model.score(X_test, y_test) print("Best Model Accuracy:", best_model_accuracy) In [11]: import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score from sklearn.model_selection import train_test_split import numpy as np # Read the dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for regression # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the dependent variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create and fit the linear regression model model = LinearRegression() model.fit(X_train, y_train)
  • 18. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 18/38 RMSE: 0.49030942756132945 R-squared: 0.0037867760725102118 Classification Report: precision recall f1-score support 0 0.43 0.03 0.05 895 1 0.59 0.98 0.74 1305 accuracy 0.59 2200 macro avg 0.51 0.50 0.39 2200 weighted avg 0.53 0.59 0.46 2200 AUC-ROC Score: 0.5386027098182752 # Predict on the test set y_pred = model.predict(X_test) # Calculate RMSE rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # Calculate R-squared value r2 = r2_score(y_test, y_pred) # Print the evaluation metrics print("RMSE:", rmse) print("R-squared:", r2) In [13]: import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report, roc_auc_score from sklearn.model_selection import train_test_split # Read the dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for classification # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create and fit the logistic regression model model = LogisticRegression() model.fit(X_train, y_train) # Predict on the test set y_pred = model.predict(X_test) y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of positive class # Generate classification report classification_report = classification_report(y_test, y_pred) print("Classification Report:") print(classification_report) # Calculate AUC-ROC score auc_roc = roc_auc_score(y_test, y_pred_proba) print("AUC-ROC Score:", auc_roc) In [15]: import pandas as pd from sklearn.cluster import KMeans
  • 19. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 19/38 Inertia Value: 7956.899346042599 from sklearn.preprocessing import StandardScaler # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for clustering # For example, let's consider 'Customer care calls' and 'Cost of the product' as th X = df[['Customer_care_calls', 'Cost_of_the_Product']] # Perform feature scaling scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # Create and fit the K-means clustering model kmeans = KMeans(n_clusters=3, random_state=42) kmeans.fit(X_scaled) # Calculate the inertia value inertia = kmeans.inertia_ # Print the inertia value print("Inertia Value:", inertia) In [16]: import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Define the models to compare models = [ ('Logistic Regression', LogisticRegression()), ('Decision Tree', DecisionTreeClassifier()), ('Random Forest', RandomForestClassifier()) ] # Iterate over the models for model_name, model in models: # Fit the model on the training data model.fit(X_train, y_train) # Make predictions on the test data y_pred = model.predict(X_test) # Calculate evaluation metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) f1 = f1_score(y_test, y_pred)
  • 20. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 20/38 Model: Logistic Regression Accuracy: 0.5895454545454546 Precision: 0.5936626281453867 Recall: 0.9762452107279693 F1-score: 0.7383367139959431 ------------------------- Model: Decision Tree Accuracy: 0.5109090909090909 Precision: 0.5984522785898538 Recall: 0.5333333333333333 F1-score: 0.5640194489465155 ------------------------- Model: Random Forest Accuracy: 0.5322727272727272 Precision: 0.5997109826589595 Recall: 0.6360153256704981 F1-score: 0.6173298624023801 ------------------------- # Print the results print(f"Model: {model_name}") print(f"Accuracy: {accuracy}") print(f"Precision: {precision}") print(f"Recall: {recall}") print(f"F1-score: {f1}") print("-------------------------") In [17]: import time import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create and fit the logistic regression model model = LogisticRegression() start_time = time.time() model.fit(X_train, y_train) end_time = time.time() # Calculate the time taken for model training training_time = end_time - start_time print("Training Time:", training_time) # Make predictions on the test set start_time = time.time() y_pred = model.predict(X_test) end_time = time.time() # Calculate the time taken for making predictions prediction_time = end_time - start_time print("Prediction Time:", prediction_time)
  • 21. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 21/38 Training Time: 0.04000043869018555 Prediction Time: 0.002998828887939453 In [18]: import pandas as pd import seaborn as sns import matplotlib.pyplot as plt # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for EDA # For example, let's consider 'Customer care calls', 'Customer rating', 'Cost of th selected_columns = ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product df_selected = df[selected_columns] # Correlation Matrix correlation_matrix = df_selected.corr() plt.figure(figsize=(8, 6)) sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm') plt.title('Correlation Matrix') plt.show() In [20]: # Pair Plots sns.pairplot(df_selected, hue='Reached.on.Time_Y.N', diag_kind='kde') plt.title('Pair Plots') plt.show()
  • 22. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 22/38 In [21]: # Box Plots plt.figure(figsize=(10, 6)) for i, column in enumerate(selected_columns[:-1]): plt.subplot(2, 2, i+1) sns.boxplot(x='Reached.on.Time_Y.N', y=column, data=df_selected) plt.xlabel('Reached.on.Time_Y.N') plt.ylabel(column) plt.tight_layout() plt.suptitle('Box Plots', y=1.05) plt.show()
  • 23. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 23/38 In [22]: # Distribution Plots plt.figure(figsize=(10, 6)) for i, column in enumerate(selected_columns[:-1]): plt.subplot(2, 2, i+1) sns.histplot(data=df_selected, x=column, hue='Reached.on.Time_Y.N', kde=True) plt.xlabel(column) plt.ylabel('Count') plt.tight_layout() plt.suptitle('Distribution Plots', y=1.05) plt.show() In [23]: import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score
  • 24. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 24/38 Accuracy: 0.5895454545454546 # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create and fit the logistic regression model model = LogisticRegression() model.fit(X_train, y_train) # Predict on the test set y_pred = model.predict(X_test) # Calculate accuracy accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy) In [5]: import pandas as pd import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] # Model Parameters model_params = { 'solver': 'lbfgs', 'C': 1.0, 'max_iter': 100, # Add other model parameters as needed } # Create a bar chart for model parameters fig = px.bar( x=list(model_params.keys()), y=list(model_params.values()), labels={'x': 'Parameter', 'y': 'Value'}, title='Model Parameters' ) fig.show() # Create a table for model parameters table_data = [['Parameter', 'Value']] table_data.extend(list(model_params.items())) fig = go.Figure(data=[go.Table(header=dict(values=table_data[0]), cells=dict(values fig.show()
  • 25. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 25/38
  • 26. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 26/38 In [25]: import pandas as pd import matplotlib.pyplot as plt import numpy as np from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Define a range of parameter values to test parameter_values = [0.001, 0.01, 0.1, 1, 10, 100] # Initialize lists to store parameter values and corresponding accuracy scores parameters = [] accuracy_scores = [] # Iterate over the parameter values for param in parameter_values: # Create and fit the logistic regression model with the current parameter value model = LogisticRegression(C=param)
  • 27. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 27/38 model.fit(X_train, y_train) # Predict on the test set y_pred = model.predict(X_test) # Calculate accuracy and append to the lists accuracy = accuracy_score(y_test, y_pred) parameters.append(param) accuracy_scores.append(accuracy) # Plot the performance with varying parameters plt.plot(parameters, accuracy_scores, marker='o') plt.xlabel('Parameter Value') plt.ylabel('Accuracy') plt.title('Model Performance with Varying Parameters') plt.xticks(np.arange(min(parameters), max(parameters) + 1, 1.0)) plt.show() In [27]: import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta # Create and fit the logistic regression model model = LogisticRegression() model.fit(X_train, y_train) # Predict on the test set y_pred = model.predict(X_test) # Generate classification report classification_metrics = classification_report(y_test, y_pred, output_dict=True)
  • 28. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 28/38 Metrics not available for class label: accuracy # Extract metrics and class labels metrics = ['precision', 'recall', 'f1-score', 'support'] class_labels = list(classification_metrics.keys())[:-1] # Exclude 'macro avg' and # Create a DataFrame to store the metrics metrics_df = pd.DataFrame(index=class_labels, columns=metrics) for label in class_labels: if isinstance(classification_metrics[label], dict): metrics_df.loc[label] = [classification_metrics[label][metric] for metric i else: print(f"Metrics not available for class label: {label}") # Plot the metrics sns.set_style("whitegrid") metrics_df.plot(kind='bar', figsize=(10, 6)) plt.xlabel('Class Labels') plt.ylabel('Metric Score') plt.title('Model Metrics - Classification Report') plt.legend(title='Metrics', bbox_to_anchor=(1, 1)) plt.show() In [28]: import pandas as pd from sklearn.linear_model import LogisticRegression # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Create and fit the logistic regression model model = LogisticRegression() model.fit(X, y) # Predict the target variable for the entire dataset y_pred = model.predict(X)
  • 29. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 29/38 Reached.on.Time_Y.N Predicted Reached on time 0 1 1 1 1 1 2 1 1 3 1 1 4 1 1 ... ... ... 10994 1 1 10995 0 1 10996 0 1 10997 0 1 10998 0 1 [10999 rows x 2 columns] Percentage of correct predictions: 59.6872442949359 # Add the predicted values to the DataFrame df['Predicted Reached on time'] = y_pred # Print the testing outcome of the whole E-commerce dataset print(df[['Reached.on.Time_Y.N', 'Predicted Reached on time']]) In [29]: import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for modeling # For example, let's consider 'Customer care calls', 'Customer rating', and 'Cost o # and 'Reached on time' as the target variable X = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']] y = df['Reached.on.Time_Y.N'] # Create and fit the logistic regression model model = LogisticRegression() model.fit(X, y) # Predict the target variable for the entire dataset y_pred = model.predict(X) # Calculate the accuracy accuracy = accuracy_score(y, y_pred) percentage_correct = accuracy * 100 # Print the percentage of correct predictions print("Percentage of correct predictions:", percentage_correct) In [3]: import pandas as pd import matplotlib.pyplot as plt import plotly.graph_objects as go # Read the E-commerce dat# Filter successful and obvious cases success_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 0] # Assuming 0 obvious_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1 # Create visualizations for the successful cases # Example: Histogram of Customer Ratings plt.figure(figsize=(8, 6)) plt.hist(success_cases['Customer_rating'], bins=10, alpha=0.5, color='green') plt.xlabel('Customer_rating') plt.ylabel('Count')
  • 30. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 30/38 plt.title('Distribution of Customer_ratings for Successful Deliveries') plt.show() # Create visualizations for the obvious cases # Example: Scatter Plot of Cost of the Product vs. Customer Rating fig = go.Figure(data=go.Scatter( x=obvious_cases['Cost_of_the_Product'], y=obvious_cases['Customer_rating'], mode='markers', marker=dict(color='red') )) fig.update_layout( title='Cost of the Product vs. Customer Rating for Obvious Failures', xaxis_title='Cost of the Product', yaxis_title='Customer Rating', ) fig.show()
  • 31. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 31/38 In [2]: import pandas as pd import matplotlib.pyplot as plt import plotly.graph_objects as go import plotly.subplots as sp # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for analysis # For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N'] # Filter failure cases failure_cases = df_selected[df_selected['Reached.on.Time_Y.N'] == 1] # Assuming 1 # Create visualizations for the failure cases # Example 1: Box Plot of Customer Rating plt.figure(figsize=(8, 6)) plt.boxplot(failure_cases['Customer_rating']) plt.xlabel('Failure Cases') plt.ylabel('Customer_rating') plt.title('Distribution of Customer Ratings for Failure Cases') plt.show() # Example 2: Scatter Plot of Cost of the Product vs. Customer Rating fig = go.Figure(data=go.Scatter( x=failure_cases['Cost_of_the_Product'], y=failure_cases['Customer_rating'], mode='markers', marker=dict(color='red')
  • 32. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 32/38 )) fig.update_layout( title='Cost of the Product vs. Customer Rating for Failure Cases', xaxis_title='Cost of the Product', yaxis_title='Customer Rating', ) fig.show()
  • 33. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 33/38 In [4]: import pandas as pd import matplotlib.pyplot as plt import plotly.graph_objects as go # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for analysis # For example, let's consider 'Customer rating', 'Cost of the product', and 'Reache df_selected = df[['Customer_rating', 'Cost_of_the_Product', 'Reached.on.Time_Y.N'] # Filter border cases border_cases = df_selected[ (df_selected['Reached.on.Time_Y.N'] == 0) | (df_selected['Reached.on.Time_Y.N' ] # Assuming 0 indicates successful delivery, and 1 indicates failure # Create visualizations for the border cases # Example 1: Scatter Plot of Cost of the Product vs. Customer Rating fig = go.Figure(data=go.Scatter( x=border_cases['Cost_of_the_Product'], y=border_cases['Customer_rating'], mode='markers', marker=dict( color=border_cases['Reached.on.Time_Y.N'], colorscale='Viridis', showscale=True ) )) fig.update_layout( title='Cost of the Product vs. Customer Rating (Border Cases)',
  • 34. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 34/38 xaxis_title='Cost of the Product', yaxis_title='Customer Rating', ) fig.show() # Example 2: Pie Chart of Reached on Time vs. Not Reached on Time counts = border_cases['Reached.on.Time_Y.N'].value_counts() fig = go.Figure(data=go.Pie(labels=counts.index, values=counts.values)) fig.update_layout( title='Distribution of Reached on Time vs. Not Reached on Time (Border Cases)', ) fig.show()
  • 35. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 35/38 In [7]: import pandas as pd import matplotlib.pyplot as plt # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Select the relevant columns for analysis # For example, let's consider 'Customer rating' and 'Reached on time' df_selected = df[['Customer_rating', 'Reached.on.Time_Y.N']] # Calculate the proportion of on-time deliveries for each customer rating grouped_data = df_selected.groupby('Customer_rating')['Reached.on.Time_Y.N'].mean( # Sort the data based on customer rating grouped_data.sort_values(by='Customer_rating', inplace=True) # Create a bar plot plt.figure(figsize=(8, 6)) plt.bar(grouped_data['Customer_rating'], grouped_data['Reached.on.Time_Y.N'], color plt.xlabel('Customer_Rating') plt.ylabel('Proportion of Product Delivered on Time') plt.title('Proportion of Product Delivered on Time by Customer Rating') plt.show()
  • 36. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 36/38 Percentage of Customer Ratings with Products Delivered on Time: 40.33% In [8]: import pandas as pd # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Calculate the percentage of customer ratings with products delivered on time total_ratings = df['Customer_rating'].count() on_time_ratings = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].count() # percentage_on_time = (on_time_ratings / total_ratings) * 100 print("Percentage of Customer Ratings with Products Delivered on Time: {:.2f}%".for In [9]: import pandas as pd # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Calculate the percentage of products delivered on time for each customer rating rating_counts = df['Customer_rating'].value_counts() on_time_counts = df[df['Reached.on.Time_Y.N'] == 0]['Customer_rating'].value_counts # Create a DataFrame with customer ratings and corresponding percentages rating_percentages = (on_time_counts / rating_counts) * 100 rating_table = pd.DataFrame({'Customer_rating': rating_percentages.index[:5], 'Percentage Delivered on Time': rating_percentages.val # Set the Customer Rating column as the index rating_table.set_index('Customer_rating', inplace=True) # Display the table print(rating_table)
  • 37. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 37/38 Percentage Delivered on Time Customer_rating 1 41.252796 2 41.200924 3 39.392586 4 40.475103 5 39.336711 Customer_care_calls 2 3 4 5 6 Customer_rating 1 0.630769 0.633968 0.574530 0.564155 0.508021 2 0.733871 0.617978 0.582985 0.575372 0.472826 3 0.616000 0.644817 0.612903 0.596413 0.509346 4 0.666667 0.600000 0.607242 0.591111 0.525822 5 0.617647 0.628099 0.609898 0.595745 0.558140 Customer_care_calls 7 Customer_rating 1 0.547170 2 0.456522 3 0.481481 4 0.485714 5 0.586207 In [13]: import pandas as pd import matplotlib.pyplot as plt # Read the E-commerce dataset into a pandas DataFrame df = pd.read_csv('E_Commerce.csv') # Group the data by customer rating and customer care calls grouped_data = df.groupby(['Customer_rating', 'Customer_care_calls'])['Reached.on.T # Pivot the data to create a table-like format pivot_table = grouped_data.pivot(index='Customer_rating', columns='Customer_care_ca # Plot the percentage graph pivot_table.plot(kind='bar', stacked=True) plt.xlabel('Customer_rating') plt.ylabel('Percentage of Products Delivered on Time') plt.title('Percentage of Products Delivered on Time by Customer Rating and Customer plt.legend(title='Customer_care_calls') plt.show() # Display the table print(pivot_table)
  • 38. 7/14/23, 10:47 PM E_Commerce file:///C:/Users/SILPI NANDI/Downloads/E_Commerce.html 38/38 In [ ]: