Predicting Future Sale
In [1]: # Computational & Basic Python Libraries
import re
import numpy as np
import pandas as pd
from collections import Counter
# Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
# Machine Learning Libraries
from sklearn.cluster import KMeans
import statsmodels.formula.api as sm
import scipy.cluster.hierarchy as sch
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering
linReg = LinearRegression()
rndFor = RandomForestClassifier()
# Python NLTK Libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
PS = PorterStemmer()
stopWords = set(stopwords.words("english"))
stopWords_rus = set(stopwords.words("russian"))
Importing Data
All the texts are in Russian and I have no idea what the names mean, so I converted everything to English using Translator() from the Google Translator Library
(https://pypi.org/project/googletrans/). Below is the code for that, if anyone is interested!
Notes
The English-translated texts were saved to Drive with the following commands. There were some encoding problems with the file, as a few characters were not properly translated to
English.
# Google Translator Library
from googletrans import Translator
translator = Translator()
# Shops Data
shopsData['shopName'] = shopsData['shop_name'].apply(lambda x: translator.translate(x).text)
shopsData[['shopName', 'shop_id']].to_csv('./data/shopsData_englishTranslated.csv', index=False, encoding='utf-8')
# Item Category Data
itemCategoryData['itemCategoryName'] = itemCategoryData['item_category_name'].apply(lambda x:
translator.translate(x).text)
itemCategoryData[['itemCategoryName', 'item_category_id']].to_csv('./data/itemCategory_englishTranslated.csv',
index=False, encoding='utf-8')
In the case of the Items Data, it is advisable not to use the Translator because:
>> There are 22,171 Item Names, and converting them all would take a huge amount of time!
>> Translator() is not fully stable, and a "ValueError Exception : No JSON Object could be decoded" was generated.
In [2]: itemsData = pd.read_csv('./data/items.csv', encoding='utf-8')
shopsData = pd.read_csv('./data/shopsData_englishTranslated.csv')
itemCategoryData = pd.read_csv('./data/itemCategory_englishTranslated.csv')
In [2]: trainData = pd.read_csv('./data/sales_train_v2.csv')
testData = pd.read_csv('./data/test.csv')
In [28]: print trainData.info(), '\n'
print testData.info(), '\n'
print 'Column Names: ', [x for x in trainData.columns if x not in testData.columns]
Categorization of Data
Intuition Behind: Generally, people tend to buy stuff from their favourite brand and also from their favourite store - depending on the store's performance and reputation. From the
given dataset, we have no idea about this information, but let's try to categorize the Shops and the Items on some other features - using only their Names.
From a link (https://www.shopify.in/blog/4660242-what-should-i-name-my-online-store) I found on the internet, here are a few things that a successful business has:
1. Short & Simple Name
2. Different from the Rest
3. Web Presence of the Business, etc.
Since most of these points are not available for our analysis, let's just focus on the Name.
Shop Names: Using K-Means Clustering on the Shop Name - with two Independent Variables: Word Count and Character Count in Shop Names, after omitting special characters.
Item Category Name: For classification of the Item Category Name, I will use some Text Classification steps:
>> Finding the list of the Top 25 Most Used Words in the Category Names, then
>> Making a list of those words and passing it as a Parameter to CountVectorizer() under Text Feature Extraction to find the Name Strength of the Item Category Name.
Item Name: Only focusing on the Name of the Item, passing the Item Name Length as a Parameter for the Machine Learning Algorithm, after removing the Stop Words from the Name.
Finally, for each Categorization we use one-hot encoding of the form:
data = pd.get_dummies(data, drop_first=True, columns=['Category_1', 'Category_2'])
In [6]: def textLengthCount(text):
# This Function takes in the Text - Clears the text of Special Characters and Returns Word Count and Character Count
cleanText = re.sub('[^A-Za-z0-9 ]+', '', text)
wordCount = len(cleanText.split())
return(wordCount, len(cleanText) - wordCount -1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date object
date_block_num int64
shop_id int64
item_id int64
item_price float64
item_cnt_day float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
ID 214200 non-null int64
shop_id 214200 non-null int64
item_id 214200 non-null int64
dtypes: int64(3)
memory usage: 4.9 MB
None
Column Names: ['date', 'date_block_num', 'item_price', 'item_cnt_day']
In [7]: %%time
shopsData['shopName_wordCount'] = shopsData['shopName'].apply(lambda x: textLengthCount(x)[0])
shopsData['shopName_lenCount'] = shopsData['shopName'].apply(lambda x: textLengthCount(x)[1])
X = shopsData.iloc[:, [2, 3]].values # Creating a 2D array for K-Means
WCSS = [] # For Shop Names: from the chart we can clearly see that the optimal number of clusters (K) = 3
for i in range(1, 7):
kMeans_Cluster = KMeans(n_clusters= i, init='k-means++')
kMeans_Cluster.fit(X)
WCSS.append(kMeans_Cluster.inertia_)
# Visualizing the WCSS Value
plt.figure(figsize=(15, 5))
plt.plot(range(1, 7), WCSS, 'o-')
plt.xticks(range(1, 7))
plt.title('Within-Cluster Sum of Squares (WCSS)')
plt.xlabel('No. of Clusters')
plt.ylabel('WCSS')
In [8]: %%time
kMeans_Cluster = KMeans(n_clusters=3)
y_kMeans = kMeans_Cluster.fit_predict(X)
In [9]: # Visualizing the Shop Category
plt.figure(figsize=(15, 8))
plt.scatter(X[y_kMeans == 0, 0], X[y_kMeans == 0, 1], c='red', label='Small Name Length')
plt.scatter(X[y_kMeans == 1, 0], X[y_kMeans == 1, 1], c='cyan', label='Average Name Length')
plt.scatter(X[y_kMeans == 2, 0], X[y_kMeans == 2, 1], c='blue', label='Large Name Length')
plt.scatter(kMeans_Cluster.cluster_centers_[:, 0], kMeans_Cluster.cluster_centers_[:, 1], s=100, c='black', label='Centroids')
plt.title('Shop Classification using K-Means')
plt.xlabel('Word Count in Shop Name')
plt.ylabel('Character Count in Shop Name')
plt.legend()
Wall time: 898 ms
Wall time: 32 ms
Out[9]: <matplotlib.legend.Legend at 0x10955d30>
In [71]: %%time
# Creating Shops Category from the Predicted Result, and appending it to the Data Set
shopsData['shopCategory'] = y_kMeans
shopsData[['shop_Cat0', 'shop_Cat1', 'shop_Cat2']] = pd.get_dummies(shopsData['shopCategory'])
In [13]: %%time
itemCategoryData['itemCategoryName_Cleaned'] = itemCategoryData['itemCategoryName'].apply(
lambda x: str(re.sub('[^A-Za-z0-9]', ' ', x)).replace('  ', ' ')).apply(
lambda x: PS.stem(x)).apply(
lambda x: ' '.join([i for i in word_tokenize(x) if i not in stopWords]))
In [14]: %%time
allWords = []
for i in range(len(itemCategoryData)):
tempWords = word_tokenize(itemCategoryData['itemCategoryName_Cleaned'][i])
for j in tempWords:
if(j == 'game'): # Stemmer was not able to Distinguish 'games' and 'game' as a Single Word
j = 'games'
allWords.append(j)
mostCommonWords = []
for i in Counter(allWords).most_common(25):
mostCommonWords.append(i[0])
print 'List of Most Common Word Item Category Name:\n', Counter(allWords).most_common(25), '\n'
In [15]: %%time
# Passing the Most Common Words as Vocabulary and Building the Name Strength Meter
countVectorizer = CountVectorizer(vocabulary=mostCommonWords)
itemCategoryData['itemCategory_nameStrength'] = np.array(countVectorizer
.fit_transform(itemCategoryData['itemCategoryName_Cleaned'])
.toarray()).mean(axis=1)
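To make the Name Strength concrete, here is a small illustration on a hypothetical cleaned category name - CountVectorizer counts each of the 25 vocabulary words, and the mean of those 25 counts is the strength.
# Hypothetical example: 4 of the 25 vocabulary words appear once each,
# so the Name Strength works out to 4/25 = 0.16
sample = ['xbox 360 games accessories']
print countVectorizer.transform(sample).toarray().mean(axis=1)  # -> [0.16]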
In [16]: %%time
itemsData['nameLen'] = itemsData['item_name'].apply(
lambda x: ' '.join([i for i in word_tokenize(x) if i not in stopWords_rus])).apply(len)
# Saving Output
itemsData.drop('item_name', axis=1).to_csv('./output/itemsData_categorised_v1.csv', index=False)
shopsData.drop(['shopName', 'shopCategory'], axis=1).to_csv('./output/shopsData_categorised_v1.csv', index=False)
itemCategoryData.drop(['itemCategoryName', 'itemCategoryName_Cleaned'],
axis=1).to_csv('./output/itemCategoryDate_categorised_v1.csv', index=False)
Validating the Categories with Item Count
After Categorization of the Data - checking whether these Categories can correctly predict the average daily Item Sale, based on the mean of Item Sale per Item, per Shop. For this, I'm
using the p-value to determine how well these categories explain the sale of an Item.
In [38]: %%time
avgItemSale_perShop = trainData[[u'shop_id', u'item_id', u'item_cnt_day']].groupby(
[u'shop_id', u'item_id']).mean().reset_index()
In [74]: %%time
# X_dataTemp : temporary variable for Building OLS Summary
X_dataTemp = avgItemSale_perShop.merge(
itemsData.drop('item_name', axis=1), on='item_id', how='left').merge(
shopsData.drop(['shopName', 'shopCategory'], axis=1), on='shop_id', how='left').merge(
itemCategoryData[['item_category_id', 'itemCategory_nameStrength']], on='item_category_id', how='left')
X_dataTemp['constant_B0'] = 1 # Since in statsmodels.formula.api() the Intercept is not Included by Default
In [91]: X_optimumFeatures = X_dataTemp.iloc[:, [11, 4, 5, 10]]
y = X_dataTemp.iloc[:, 2]
In [92]: %%time
# OLS : Ordinary Least Squares
regressor_OLS = sm.OLS(endog=y, exog=X_optimumFeatures).fit()
Wall time: 2 ms
Wall time: 207 ms
List of Most Common Word Item Category Name:
[('games', 24), ('books', 12), (u'gifts', 12), ('accessories', 8), ('consoles', 8), ('music', 7), ('xbox', 6), ('programs', 6), (u'pc', 5), ('cards', 5), ('cinema', 5), ('payment', 5), (u'edit', 4), ('ps2', 3), ('ps3', 3), ('ps4', 3), ('digit', 3), ('psp', 3), ('psvita', 3), ('blu', 3), ('360', 3), (u'literatur', 3), ('ray', 3), (u'offic', 2), (u'board', 2)]
Wall time: 21 ms
Wall time: 4 ms
Wall time: 4.73 s
Wall time: 655 ms
Wall time: 341 ms
Wall time: 102 ms
In [93]: %%time
regressor_OLS.summary()
Conclusion: Generally, for the Backward Elimination Process, the optimal value of P > |t| is to be kept under the SignificanceLevel(SL) <= 5%.
From the OLS Summary, shopName_lenCount has P > |t| = 0.581, i.e. SL > 5%, thus not considering the Feature.
But, while considering the Shop Category (by using drop_first=True to minimise the Dummy Variable Trap), SL > 5% as well, from which it can be concluded that Categorization of the Shops
based on Name is not a Good Idea!
Thus, considering the following Features, which seem to be a Good Item Count Predictor for a Month!
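For reference, a minimal sketch of that Backward Elimination loop, assuming the column names built above in X_dataTemp: refit the OLS, drop the feature with the highest p-value, and repeat until every remaining P > |t| is under the 5% Significance Level.
# Backward Elimination sketch (the candidate set is an assumption based on the columns built above)
candidates = X_dataTemp[['constant_B0', 'nameLen', 'shopName_wordCount', 'shopName_lenCount',
                         'shop_Cat1', 'shop_Cat2', 'itemCategory_nameStrength']]
SL = 0.05
while True:
    fitted = sm.OLS(endog=y, exog=candidates).fit()
    pvalues = fitted.pvalues.drop('constant_B0')  # keep the intercept
    if pvalues.empty or pvalues.max() <= SL:
        break
    candidates = candidates.drop(pvalues.idxmax(), axis=1)  # e.g. shopName_lenCount at 0.581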
In [95]: X_optimumFeatures.drop(u'constant_B0', axis=1).columns
Classification of Item Price
An Item's Price should be a very important deterministic factor while considering sale of the Item on a Daily Basis.
Exploring Training & Testing Data based on Item Price, Shop ID and Item ID.
In [106]: %%time
print 'Train Data\n=========='
print 'No. of Unique Shops:', trainData['shop_id'].nunique()
print 'No. of Unique Items:', trainData['item_id'].nunique()
print '\nTest Data\n========='
print 'No. of Unique Shops:', testData['shop_id'].nunique()
print 'No. of Unique Items:', testData['item_id'].nunique()
print '\n'
print 'Train-Test Data Mis-Match Count:', len([x for x in trainData['item_id'].unique()
if x not in testData['item_id'].unique()])
print 'Test-Train Data Mis-Match Count:', len([x for x in testData['item_id'].unique()
if x not in trainData['item_id'].unique()])
print '\n'
Wall time: 121 ms
Out[93]:
OLS Regression Results
Dep. Variable: item_cnt_day R-squared: 0.001
Model: OLS Adj. R-squared: 0.001
Method: Least Squares F-statistic: 174.6
Date: Wed, 23 May 2018 Prob (F-statistic): 4.16e-113
Time: 15:03:01 Log-Likelihood: -6.5911e+05
No. Observations: 424124 AIC: 1.318e+06
Df Residuals: 424120 BIC: 1.318e+06
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
constant_B0 1.0604 0.007 142.870 0.000 1.046 1.075
nameLen 0.0010 8.58e-05 12.089 0.000 0.001 0.001
shopName_wordCount -0.0212 0.002 -13.330 0.000 -0.024 -0.018
itemCategory_nameStrength 0.6110 0.052 11.749 0.000 0.509 0.713
Omnibus: 2612880.482 Durbin-Watson: 1.895
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1286148985691768.750
Skew: 468.684 Prob(JB): 0.00
Kurtosis: 269778.319 Cond. No. 1.33e+03
Out[95]: Index([u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength'], dtype='object')
Train Data
==========
No. of Unique Shops: 60
No. of Unique Items: 21807
Test Data
=========
No. of Unique Shops: 42
No. of Unique Items: 5100
Train-Test Data Mis-Match Count: 17070
Test-Train Data Mis-Match Count: 363
Wall time: 4min 33s
Conclusion:
1. As we can see, there is a mismatch in the number of Items between the Training & Testing Data.
2. Most of the Items are present in the Training Data,
3. However, there are 363 unique, new Items in the Test Data - which are not present in the Training Set.
4. Also, there are no new Shops in the Testing Data.
One important criterion which seems to be a deciding factor is the classification of Item ID together with Shop ID.
To classify the Item Price, let's do the following things:
1. Classifying the Shops, based on the Number of Items Available
2. Classifying the Items, based on their Availability
3. Finding the mean(), median() and std() price of the Items, on the basis of:
a. per Item, per Shop, per Month (see the sketch below)
b. Combining the above parameters, and finding new Categories
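A minimal sketch of step 3a, assuming trainData as loaded above - mean, median and standard deviation of the item price per Item, per Shop, per Month:
# Price statistics per item, per shop, per month (date_block_num)
priceStats = (trainData
              .groupby(['item_id', 'shop_id', 'date_block_num'])['item_price']
              .agg(['mean', 'median', 'std'])
              .reset_index()
              .rename(columns={'mean': 'meanPrice', 'median': 'medianPrice', 'std': 'stdPrice'}))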
Classifying the Shops, based on the Number of Items Available
Intuition Behind: Shops with a larger number of items might attract more customers - hence more sales!
In [12]: %%time
totalCount_uniqueItems_perShop_trainData = trainData[['shop_id',
'item_id']].groupby('shop_id').item_id.nunique().reset_index().rename(
columns={
'item_id' : 'Count_uniqueItems'
})
totalCount_uniqueItems_perShop_testData = testData[['shop_id',
'item_id']].groupby('shop_id').item_id.nunique().reset_index().rename(
columns={
'item_id' : 'Count_uniqueItems'
})
In [13]: %%time
plt.figure(figsize=(15, 5))
plt.scatter(totalCount_uniqueItems_perShop_trainData['shop_id'], totalCount_uniqueItems_perShop_trainData['Count_uniqueItems'])
plt.title('Scatter Plot of Shop ID and No. of Unique Items')
plt.xlabel('Shop ID')
plt.ylabel('Total No. of Unique Items per Shop')
In [14]: X = totalCount_uniqueItems_perShop_trainData.iloc[:, :].values
Wall time: 787 ms
Wall time: 54 ms
In [15]: %%time
plt.figure(figsize=(25, 10))
plt.rcParams.update({'font.size': 15})
dendoGram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.axhline(y=15000, color='black', linestyle='-')
plt.axhline(y=12500, color='black', linestyle='--')
plt.axhline(y=17500, color='black', linestyle='--')
plt.title('Dendrogram')
plt.xlabel('Clusters/Points')
plt.ylabel('Euclidean Distance')
plt.yticks([5000, 10000, 12500, 15000, 17500, 20000, 25000, 30000])
From the Dendrogram, cutting at a Threshold (Θ) = 15,000 units:
The Optimal Number of Clusters is: 3
Based on the Optimal Number of Categories, categorising the Shops, based on the Number of Items available - for both Train & Test Data.
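As a cross-check (a sketch only, reusing the same Ward linkage as the dendrogram above), cutting the tree at the 15,000-unit threshold should reproduce the same three groups via scipy's fcluster:
from scipy.cluster.hierarchy import fcluster
linkageMatrix = sch.linkage(X, method='ward')
shopClusters = fcluster(linkageMatrix, t=15000, criterion='distance')  # labels 1..3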
In [16]: X_testData = totalCount_uniqueItems_perShop_testData.iloc[:, :].values
In [17]: %%time
agg_HC = AgglomerativeClustering(n_clusters=3) # Affinity & Linkage are left at their defaults
X_HC_trainData = agg_HC.fit_predict(X)
X_HC_testData = agg_HC.fit_predict(X_testData)
In [18]: plt.figure(figsize=(15, 5))
plt.rcParams.update({'font.size': 10})
plt.scatter(X[X_HC_trainData == 0, 0], X[X_HC_trainData == 0, 1], c = 'red', label = 'Low Items')
plt.scatter(X[X_HC_trainData == 1, 0], X[X_HC_trainData == 1, 1], c = 'blue', label = 'High Items')
plt.scatter(X[X_HC_trainData == 2, 0], X[X_HC_trainData == 2, 1], c = 'green', label = 'Average Items')
plt.title('Cluster of Shop ID and No. of Unique Items')
plt.xlabel('Shop ID')
plt.ylabel('Total No. of Unique Items per Shop')
plt.legend(loc=4)
In [19]: totalCount_uniqueItems_perShop_trainData['shop_itemAvailibility'] = X_HC_trainData
totalCount_uniqueItems_perShop_testData['shop_itemAvailibility'] = X_HC_testData
Wall time: 184 ms
Wall time: 1e+03 µs
Out[18]: <matplotlib.legend.Legend at 0xecef5f8>
In [29]: totalCount_uniqueItems_perShop_trainData[['lowItemCount', 'highItemCount']] = pd.get_dummies(
totalCount_uniqueItems_perShop_trainData['shop_itemAvailibility'])[[0, 1]]
totalCount_uniqueItems_perShop_testData[['lowItemCount', 'highItemCount']] = pd.get_dummies(
totalCount_uniqueItems_perShop_testData['shop_itemAvailibility'])[[0, 1]]
# Saving Output
totalCount_uniqueItems_perShop_trainData.to_csv('./output/totalCount_uniqueItems_perShop_trainData.csv', index=False)
totalCount_uniqueItems_perShop_testData.to_csv('./output/totalCount_uniqueItems_perShop_testData.csv', index=False)
Classifying the Items Based on their Availability
Intuition Behind: Items which are widely available should have distributed sales, but in case an item is available in only a small number of Shops, the sale of that item from those Shops
should be high, or vice versa - and the Machine Learning Model should be able to pick up that parameter!
In [22]: %%time
itemAvailibility_train = trainData[['shop_id', 'item_id']].groupby('item_id').shop_id.nunique().reset_index()
itemAvailibility_train = itemAvailibility_train.rename(columns={
'shop_id' : 'shopCount'
})
itemAvailibility_test = testData[['shop_id', 'item_id']].groupby('item_id').shop_id.nunique().reset_index()
itemAvailibility_test = itemAvailibility_test.rename(columns={
'shop_id' : 'shopCount'
})
In [25]: %%time
plt.figure(figsize=(15, 6))
plt.rcParams.update({'font.size': 12.5})
plt.plot(itemAvailibility_train.groupby('shopCount').item_id.nunique().reset_index()['item_id'].apply(lambda x: x/2000.0),
label='Distribution (2e^-3)')
plt.boxplot(itemAvailibility_train['shopCount'], showmeans=True, vert=False)
plt.axvline(itemAvailibility_train['shopCount'].median(), color='black', linestyle='--', alpha=0.5)
plt.axvline(itemAvailibility_train['shopCount'].std(), color='cyan', linestyle='--', alpha=0.5, label='Standard Deviation')
plt.axvline(3.0, color='black', linestyle='--', alpha=0.5)
plt.axvline(34.0, color='black', linestyle='--', alpha=0.5)
plt.axvline(42.0, color='magenta', linestyle='--', alpha=0.5, label='Test Data Shop Count')
plt.title('Distribution of Shop Count')
plt.xlabel('Shop Count per Item')
plt.ylabel('No. of Shops per the Variations of Shop Count')
plt.xticks([0, 5, 10, itemAvailibility_train['shopCount'].median(), itemAvailibility_train['shopCount'].mean(),
25, 30, 35, 40, 45, 50, 55, 60])
plt.yticks([0, 2])
plt.legend()
Wall time: 711 ms
Wall time: 91 ms
Based on the above graph, we can conclude the following:
1. In the Training Data, most of the Items are available in a small number of Shops,
2. In case of the Test Data, all the items are available in all of the 42 unique Shops.
Based on this, let's classify them as follows:
Category Number Definition Shop Count Range
Category 1 ('cat1') Low Availability of Items [0, 5)
Category 2 ('cat2') Moderately Available, below Median [5, 15]
Category 3 ('cat3') Moderately Available, above Median (15, 35)
Category 4 ('cat4') Highly Available Items [35, 60)
Note: All the items in the Test Data fall into Category 4. From the above categories, let's consider Category 1, 3 & 4 for Feature Engineering.
In [26]: %%time
def itemAvailable_category(shopCount):
if(shopCount < 5):
return('cat1')
elif(shopCount <= 15):
return('cat2')
elif(shopCount < 35):
return('cat3')
else:
return('cat4')
itemAvailibility_train['itemAvailble_category'] = itemAvailibility_train['shopCount'].apply(lambda x: itemAvailable_category(x))
# For Test Data, since all the values fall into Category 4, we will directly append it to the dataset.
In [27]: itemAvailibility_train[[
'itemAC_cat1', 'itemAC_cat3', 'itemAC_cat4'
]] = pd.get_dummies(itemAvailibility_train['itemAvailble_category'])[['cat1', 'cat3', 'cat4']]
itemAvailibility_test['itemAC_cat1'] = 0
itemAvailibility_test['itemAC_cat3'] = 0
itemAvailibility_test['itemAC_cat4'] = 1
# Saving Output
itemAvailibility_train.to_csv('./output/itemAvailibility_train.csv', index=False)
itemAvailibility_test.to_csv('./output/itemAvailibility_test.csv', index=False)
Building Stats-Model for OLS Summary
Now, let's make a Matrix of Features, merging the two datasets - trainData with itemAvailibility_train and testData with itemAvailibility_test - and build a Stats Model to find whether
they are (if at all) an accurate predictor of Item Price.
In [35]: %%time
X_data = trainData[['shop_id', 'item_id', 'item_price']].merge(
itemAvailibility_train.drop('itemAvailble_category', axis=1), on='item_id', how='left').merge(
totalCount_uniqueItems_perShop_trainData.drop('shop_itemAvailibility', axis=1), on='shop_id', how='left')
X_data['constant_B0'] = 1 # Since in statsmodels.formula.api() the Intercept is not Included by Default
In [36]: X_optimumFeatures = X_data.iloc[:, [10, 3, 4, 5, 6, 7, 8, 9]]
y = X_data.iloc[:, 2]
In [37]: %%time
# OLS : Ordinary Least Squares
regressor_OLS = sm.OLS(endog=y, exog=X_optimumFeatures).fit()
Wall time: 12 ms
Wall time: 1.29 s
Wall time: 1.7 s
In [38]: %%time
regressor_OLS.summary()
Conclusion: Generally, the optimal value of P > |t| is given by the SignificanceLevel(SL) <= 5%.
From the Feature Matrix, every P > |t| is under this level, thus it is concluded that all the Features are required - and they are a good predictor for the Item Price.
Thus, considering the following Features, which seem to be Good Predictors for the Item Price!
In [40]: X_optimumFeatures.drop(u'constant_B0', axis=1).columns
Based on these Features, use a Multiple Linear Regression Model - trained on the Training Data - to predict the Item Price for the Test Data.
In [66]: y = testData[['shop_id', 'item_id']].merge(
itemAvailibility_test, on='item_id', how='left').merge(
totalCount_uniqueItems_perShop_testData.drop('shop_itemAvailibility', axis=1), on='shop_id', how='left')
In [70]: %%time
linReg.fit(X_data[[u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4',
u'Count_uniqueItems', u'lowItemCount', u'highItemCount']], X_data['item_price'])
In [71]: predicted_itemPrice = linReg.predict(y[[u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4',
u'Count_uniqueItems', u'lowItemCount', u'highItemCount']])
In [73]: testData['item_price'] = predicted_itemPrice
In [76]: testData['date_block_num'] = 34
In [77]: print 'Column Names: ', [x for x in trainData.columns if x not in testData.columns]
# Saving Output File
testData.to_csv('./output/new_testData.csv', index=False)
Wall time: 1.5 s
Out[38]:
OLS Regression Results
Dep. Variable: item_price R-squared: 0.009
Model: OLS Adj. R-squared: 0.009
Method: Least Squares F-statistic: 3812.
Date: Thu, 24 May 2018 Prob (F-statistic): 0.00
Time: 12:30:36 Log-Likelihood: -2.6041e+07
No. Observations: 2935849 AIC: 5.208e+07
Df Residuals: 2935841 BIC: 5.208e+07
Df Model: 7
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
constant_B0 878.6534 9.454 92.941 0.000 860.124 897.183
shopCount 14.0939 0.202 69.846 0.000 13.698 14.489
itemAC_cat1 21.0296 10.311 2.039 0.041 0.819 41.240
itemAC_cat3 -432.4488 7.061 -61.249 0.000 -446.287 -418.610
itemAC_cat4 -349.8570 9.408 -37.188 0.000 -368.296 -331.418
Count_uniqueItems -0.0263 0.001 -32.429 0.000 -0.028 -0.025
lowItemCount -115.1687 5.146 -22.381 0.000 -125.254 -105.083
highItemCount 52.4302 4.360 12.026 0.000 43.886 60.975
Omnibus: 4945493.941 Durbin-Watson: 0.757
Prob(Omnibus): 0.000 Jarque-Bera (JB): 25209634895.407
Skew: 10.829 Prob(JB): 0.00
Kurtosis: 456.448 Cond. No. 1.41e+05
Out[40]: Index([u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4',
u'Count_uniqueItems', u'lowItemCount', u'highItemCount'],
dtype='object')
Wall time: 1.24 s
Out[70]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Column Names: ['date', 'item_cnt_day']
Finding the Item Price Features
In [78]: def priceChanged(listPrice):
# Returns 1 if there is a Change in Price of the Item, else 0
if(len(listPrice) > 1):
return 1
else:
return 0
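A quick illustration of the flag on hypothetical price lists:
priceChanged([299.0, 349.0])  # -> 1, the item was observed at more than one price
priceChanged([299.0])         # -> 0, a single observed price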
Item Price - per Item
In [81]: %%time
#Train Data
itemPrice_perItem_trainData = trainData[[u'item_id',
u'item_price']].groupby(u'item_id').agg({'item_price' :
lambda x: list(set(x))}).reset_index()
itemPrice_perItem_trainData.rename(columns=({'item_price' : 'listPrice'}), inplace=True)
itemPrice_perItem_trainData['averagePrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.mean(x))
itemPrice_perItem_trainData['medianPrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.median(x))
itemPrice_perItem_trainData['stdPrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.std(x))
itemPrice_perItem_trainData['priceChanged_for_perItem'] = itemPrice_perItem_trainData['listPrice'].apply(
lambda x: priceChanged(x))
#Test Data
itemPrice_perItem_testData = testData[[u'item_id',
u'item_price']].groupby(u'item_id').agg({'item_price' :
lambda x: list(set(x))}).reset_index()
itemPrice_perItem_testData.rename(columns=({'item_price' : 'listPrice'}), inplace=True)
itemPrice_perItem_testData['averagePrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.mean(x))
itemPrice_perItem_testData['medianPrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.median(x))
itemPrice_perItem_testData['stdPrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.std(x))
itemPrice_perItem_testData['priceChanged_for_perItem'] = itemPrice_perItem_testData['listPrice'].apply(
lambda x: priceChanged(x))
# Saving Output File
itemPrice_perItem_trainData.to_csv('./output/itemPrice_perItem_trainData.csv', index=False)
itemPrice_perItem_testData.to_csv('./output/itemPrice_perItem_testData.csv', index=False)
Item Sale Plot
In [3]: %%time
itemSold = trainData[[u'date_block_num', u'item_cnt_day']].groupby(u'date_block_num').sum().reset_index()
itemSold.rename(columns={'date_block_num' : 'monthNumber',
'item_cnt_day' : 'itemSold_month'}, inplace=True)
Wall time: 4.54 s
Wall time: 253 ms
In [28]: %%time
plt.figure(figsize=(20, 7))
plt.rcParams.update({'font.size': 15})
plt.plot(itemSold['itemSold_month'], label='Item Sold')
plt.axhline(itemSold['itemSold_month'].mean(), color='black', linestyle='--', alpha=0.5, label='Mean Sale')
plt.title('Trend in Total Sale')
plt.xlabel('Month')
plt.ylabel('Sale')
plt.xticks(range(0, 35, 5))
plt.yticks(range(50000, 225000, 25000))
plt.legend()
In [31]: %%time
plt.figure(figsize=(20, 5))
plt.rcParams.update({'font.size': 12.5})
plt.plot(itemSold['itemSold_month'].rolling(window=12, center=False).mean(), label='Rolling Mean')
plt.plot(itemSold['itemSold_month'].rolling(window=12, center=False).std(), label='Rolling STD')
plt.legend()
Conclusion: The Sale of Items shows a gradually decaying trend, so I'm going to fit the Model using:
>> Linear Regression, and
>> Random Forest
Fitting Linear Regression Model
Preparing Data
In [93]: itemsData = pd.read_csv('./output/itemsData_categorised_v1.csv')
shopsData = pd.read_csv('./output/shopsData_categorised_v1.csv')
itemCategoryData = pd.read_csv('./output/itemCategoryDate_categorised_v1.csv')
In [114]: %%time
final_trainData = trainData.drop('date', axis=1).merge(
itemsData, on='item_id', how='left').merge(
shopsData[['shop_id', u'shopName_wordCount']], on='shop_id', how='left').merge(
itemCategoryData, on='item_category_id', how='left').merge(
itemPrice_perItem_trainData.drop('listPrice', axis=1), on='item_id')
final_testData = testData.merge(
itemsData, on='item_id', how='left').merge(
shopsData[['shop_id', u'shopName_wordCount']], on='shop_id', how='left').merge(
itemCategoryData, on='item_category_id', how='left').merge(
itemPrice_perItem_testData.drop('listPrice', axis=1), on='item_id')
Wall time: 53 ms
Wall time: 49 ms
Wall time: 3.2 s
# Saving Output to a File
final_trainData.to_csv('./output/final_trainData.csv', index=False)
final_testData.to_csv('./output/final_testData.csv', index=False)
In [117]: print 'Column Names: ', [x for x in final_trainData.columns if x not in final_testData.columns]
In [118]: final_trainData.columns
Linear Regression Model
In [119]: linReg.fit(final_trainData[[u'date_block_num', u'shop_id', u'item_id', u'item_price',
u'item_category_id', u'nameLen', u'shopName_wordCount',
u'itemCategory_nameStrength', u'averagePrice', u'medianPrice',
u'stdPrice', u'priceChanged_for_perItem']], final_trainData[u'item_cnt_day'])
In [120]: predicted_itemCount = linReg.predict(final_testData[[u'date_block_num', u'shop_id', u'item_id', u'item_price',
u'item_category_id', u'nameLen', u'shopName_wordCount',
u'itemCategory_nameStrength', u'averagePrice', u'medianPrice',
u'stdPrice', u'priceChanged_for_perItem']])
In [122]: %%time
resultantDict = {
'ID' : testData['ID'],
'item_cnt_month' : predicted_itemCount
}
linReg_output = pd.DataFrame(resultantDict)
linReg_output.to_csv('./output/finalOutput_linReg.csv', index=False)
Random Forest Classifier
In [3]: %%time
rndFor.fit(final_trainData[[u'date_block_num', u'shop_id', u'item_id', u'item_price',
u'item_category_id', u'nameLen', u'shopName_wordCount',
u'itemCategory_nameStrength', u'averagePrice', u'medianPrice',
u'stdPrice', u'priceChanged_for_perItem']], final_trainData[u'item_cnt_day'])
In [4]: predicted_itemCount_rndForest = rndFor.predict(final_testData[[u'date_block_num', u'shop_id', u'item_id', u'item_price',
u'item_category_id', u'nameLen', u'shopName_wordCount',
u'itemCategory_nameStrength', u'averagePrice', u'medianPrice',
u'stdPrice', u'priceChanged_for_perItem']])
In [7]: %%time
resultantDict = {
'ID' : testData['ID'],
'item_cnt_month' : predicted_itemCount_rndForest
}
rndFor_output = pd.DataFrame(resultantDict)
rndFor_output.to_csv('./output/finalOutput_rndFor.csv', index=False)
Column Names: ['item_cnt_day']
Out[118]: Index([u'date_block_num', u'shop_id', u'item_id', u'item_price',
u'item_cnt_day', u'item_category_id', u'nameLen', u'shopName_wordCount',
u'itemCategory_nameStrength', u'averagePrice', u'medianPrice',
u'stdPrice', u'priceChanged_for_perItem'],
dtype='object')
Out[119]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Wall time: 327 ms
Wall time: 1min 35s
Out[3]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
Wall time: 703 ms

More Related Content

Similar to Predicting Future Sale

Ruby on rails
Ruby on rails Ruby on rails
Ruby on rails Mohit Jain
ย 
ADBMS ASSIGNMENT
ADBMS ASSIGNMENTADBMS ASSIGNMENT
ADBMS ASSIGNMENTLori Moore
ย 
Feature Engineering in NLP.pdf
Feature Engineering in NLP.pdfFeature Engineering in NLP.pdf
Feature Engineering in NLP.pdfbilaje4244prolugcom
ย 
PPT on Data Science Using Python
PPT on Data Science Using PythonPPT on Data Science Using Python
PPT on Data Science Using PythonNishantKumar1179
ย 
interenship.pptx
interenship.pptxinterenship.pptx
interenship.pptxNaveen316549
ย 
Lerman Vvs14 Ef Tips And Tricks
Lerman Vvs14  Ef Tips And TricksLerman Vvs14  Ef Tips And Tricks
Lerman Vvs14 Ef Tips And TricksJulie Lerman
ย 
Python-for-Data-Analysis.pptx
Python-for-Data-Analysis.pptxPython-for-Data-Analysis.pptx
Python-for-Data-Analysis.pptxParveenShaik21
ย 
Elastic tire demo
Elastic tire demoElastic tire demo
Elastic tire demoScott Hamilton
ย 
Unit 3_Numpy_Vsp.pptx
Unit 3_Numpy_Vsp.pptxUnit 3_Numpy_Vsp.pptx
Unit 3_Numpy_Vsp.pptxprakashvs7
ย 
A Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiA Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiGrowth Intelligence
ย 
Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Wongnai
ย 
New Features of SQL Server 2016
New Features of SQL Server 2016New Features of SQL Server 2016
New Features of SQL Server 2016Mir Mahmood
ย 
Multidimensional Data Analysis with Ruby (sample)
Multidimensional Data Analysis with Ruby (sample)Multidimensional Data Analysis with Ruby (sample)
Multidimensional Data Analysis with Ruby (sample)Raimonds Simanovskis
ย 
ORM in Django
ORM in DjangoORM in Django
ORM in DjangoHoang Nguyen
ย 
Django ORM - Marcin Markiewicz
Django ORM - Marcin Markiewicz Django ORM - Marcin Markiewicz
Django ORM - Marcin Markiewicz Sunscrapers
ย 
20.1 Java working with abstraction
20.1 Java working with abstraction20.1 Java working with abstraction
20.1 Java working with abstractionIntro C# Book
ย 
In memory OLAP engine
In memory OLAP engineIn memory OLAP engine
In memory OLAP engineWO Community
ย 
Reactive Access to MongoDB from Scala
Reactive Access to MongoDB from ScalaReactive Access to MongoDB from Scala
Reactive Access to MongoDB from ScalaHermann Hueck
ย 

Similar to Predicting Future Sale (20)

Ruby on rails
Ruby on rails Ruby on rails
Ruby on rails
ย 
ADBMS ASSIGNMENT
ADBMS ASSIGNMENTADBMS ASSIGNMENT
ADBMS ASSIGNMENT
ย 
Feature Engineering in NLP.pdf
Feature Engineering in NLP.pdfFeature Engineering in NLP.pdf
Feature Engineering in NLP.pdf
ย 
PPT on Data Science Using Python
PPT on Data Science Using PythonPPT on Data Science Using Python
PPT on Data Science Using Python
ย 
interenship.pptx
interenship.pptxinterenship.pptx
interenship.pptx
ย 
Lerman Vvs14 Ef Tips And Tricks
Lerman Vvs14  Ef Tips And TricksLerman Vvs14  Ef Tips And Tricks
Lerman Vvs14 Ef Tips And Tricks
ย 
Python-for-Data-Analysis.pptx
Python-for-Data-Analysis.pptxPython-for-Data-Analysis.pptx
Python-for-Data-Analysis.pptx
ย 
Lecture 9.pptx
Lecture 9.pptxLecture 9.pptx
Lecture 9.pptx
ย 
Elastic tire demo
Elastic tire demoElastic tire demo
Elastic tire demo
ย 
Unit 3_Numpy_Vsp.pptx
Unit 3_Numpy_Vsp.pptxUnit 3_Numpy_Vsp.pptx
Unit 3_Numpy_Vsp.pptx
ย 
A Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiA Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with Luigi
ย 
Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)
ย 
New Features of SQL Server 2016
New Features of SQL Server 2016New Features of SQL Server 2016
New Features of SQL Server 2016
ย 
Multidimensional Data Analysis with Ruby (sample)
Multidimensional Data Analysis with Ruby (sample)Multidimensional Data Analysis with Ruby (sample)
Multidimensional Data Analysis with Ruby (sample)
ย 
ORM in Django
ORM in DjangoORM in Django
ORM in Django
ย 
Django ORM - Marcin Markiewicz
Django ORM - Marcin Markiewicz Django ORM - Marcin Markiewicz
Django ORM - Marcin Markiewicz
ย 
20.1 Java working with abstraction
20.1 Java working with abstraction20.1 Java working with abstraction
20.1 Java working with abstraction
ย 
Linq
LinqLinq
Linq
ย 
In memory OLAP engine
In memory OLAP engineIn memory OLAP engine
In memory OLAP engine
ย 
Reactive Access to MongoDB from Scala
Reactive Access to MongoDB from ScalaReactive Access to MongoDB from Scala
Reactive Access to MongoDB from Scala
ย 

Recently uploaded

OS-operating systems- ch04 (Threads) ...
OS-operating systems- ch04 (Threads) ...OS-operating systems- ch04 (Threads) ...
OS-operating systems- ch04 (Threads) ...Dr. Mazin Mohamed alkathiri
ย 
Earth Day Presentation wow hello nice great
Earth Day Presentation wow hello nice greatEarth Day Presentation wow hello nice great
Earth Day Presentation wow hello nice greatYousafMalik24
ย 
Planning a health career 4th Quarter.pptx
Planning a health career 4th Quarter.pptxPlanning a health career 4th Quarter.pptx
Planning a health career 4th Quarter.pptxLigayaBacuel1
ย 
What is Model Inheritance in Odoo 17 ERP
What is Model Inheritance in Odoo 17 ERPWhat is Model Inheritance in Odoo 17 ERP
What is Model Inheritance in Odoo 17 ERPCeline George
ย 
Hierarchy of management that covers different levels of management
Hierarchy of management that covers different levels of managementHierarchy of management that covers different levels of management
Hierarchy of management that covers different levels of managementmkooblal
ย 
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptx
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptxECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptx
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptxiammrhaywood
ย 
Grade 9 Q4-MELC1-Active and Passive Voice.pptx
Grade 9 Q4-MELC1-Active and Passive Voice.pptxGrade 9 Q4-MELC1-Active and Passive Voice.pptx
Grade 9 Q4-MELC1-Active and Passive Voice.pptxChelloAnnAsuncion2
ย 
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPT
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPTECONOMIC CONTEXT - LONG FORM TV DRAMA - PPT
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPTiammrhaywood
ย 
Influencing policy (training slides from Fast Track Impact)
Influencing policy (training slides from Fast Track Impact)Influencing policy (training slides from Fast Track Impact)
Influencing policy (training slides from Fast Track Impact)Mark Reed
ย 
Computed Fields and api Depends in the Odoo 17
Computed Fields and api Depends in the Odoo 17Computed Fields and api Depends in the Odoo 17
Computed Fields and api Depends in the Odoo 17Celine George
ย 
Quarter 4 Peace-education.pptx Catch Up Friday
Quarter 4 Peace-education.pptx Catch Up FridayQuarter 4 Peace-education.pptx Catch Up Friday
Quarter 4 Peace-education.pptx Catch Up FridayMakMakNepo
ย 
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptx
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptxMULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptx
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptxAnupkumar Sharma
ย 
ACC 2024 Chronicles. Cardiology. Exam.pdf
ACC 2024 Chronicles. Cardiology. Exam.pdfACC 2024 Chronicles. Cardiology. Exam.pdf
ACC 2024 Chronicles. Cardiology. Exam.pdfSpandanaRallapalli
ย 
How to Configure Email Server in Odoo 17
How to Configure Email Server in Odoo 17How to Configure Email Server in Odoo 17
How to Configure Email Server in Odoo 17Celine George
ย 
Procuring digital preservation CAN be quick and painless with our new dynamic...
Procuring digital preservation CAN be quick and painless with our new dynamic...Procuring digital preservation CAN be quick and painless with our new dynamic...
Procuring digital preservation CAN be quick and painless with our new dynamic...Jisc
ย 
How to do quick user assign in kanban in Odoo 17 ERP
How to do quick user assign in kanban in Odoo 17 ERPHow to do quick user assign in kanban in Odoo 17 ERP
How to do quick user assign in kanban in Odoo 17 ERPCeline George
ย 
Employee wellbeing at the workplace.pptx
Employee wellbeing at the workplace.pptxEmployee wellbeing at the workplace.pptx
Employee wellbeing at the workplace.pptxNirmalaLoungPoorunde1
ย 

Recently uploaded (20)

OS-operating systems- ch04 (Threads) ...
OS-operating systems- ch04 (Threads) ...OS-operating systems- ch04 (Threads) ...
OS-operating systems- ch04 (Threads) ...
ย 
Earth Day Presentation wow hello nice great
Earth Day Presentation wow hello nice greatEarth Day Presentation wow hello nice great
Earth Day Presentation wow hello nice great
ย 
Model Call Girl in Bikash Puri Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
Model Call Girl in Bikash Puri  Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”Model Call Girl in Bikash Puri  Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
Model Call Girl in Bikash Puri Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
ย 
Planning a health career 4th Quarter.pptx
Planning a health career 4th Quarter.pptxPlanning a health career 4th Quarter.pptx
Planning a health career 4th Quarter.pptx
ย 
What is Model Inheritance in Odoo 17 ERP
What is Model Inheritance in Odoo 17 ERPWhat is Model Inheritance in Odoo 17 ERP
What is Model Inheritance in Odoo 17 ERP
ย 
Hierarchy of management that covers different levels of management
Hierarchy of management that covers different levels of managementHierarchy of management that covers different levels of management
Hierarchy of management that covers different levels of management
ย 
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptx
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptxECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptx
ECONOMIC CONTEXT - PAPER 1 Q3: NEWSPAPERS.pptx
ย 
Grade 9 Q4-MELC1-Active and Passive Voice.pptx
Grade 9 Q4-MELC1-Active and Passive Voice.pptxGrade 9 Q4-MELC1-Active and Passive Voice.pptx
Grade 9 Q4-MELC1-Active and Passive Voice.pptx
ย 
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPT
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPTECONOMIC CONTEXT - LONG FORM TV DRAMA - PPT
ECONOMIC CONTEXT - LONG FORM TV DRAMA - PPT
ย 
Influencing policy (training slides from Fast Track Impact)
Influencing policy (training slides from Fast Track Impact)Influencing policy (training slides from Fast Track Impact)
Influencing policy (training slides from Fast Track Impact)
ย 
Computed Fields and api Depends in the Odoo 17
Computed Fields and api Depends in the Odoo 17Computed Fields and api Depends in the Odoo 17
Computed Fields and api Depends in the Odoo 17
ย 
Model Call Girl in Tilak Nagar Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
Model Call Girl in Tilak Nagar Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”Model Call Girl in Tilak Nagar Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
Model Call Girl in Tilak Nagar Delhi reach out to us at ๐Ÿ”9953056974๐Ÿ”
ย 
Quarter 4 Peace-education.pptx Catch Up Friday
Quarter 4 Peace-education.pptx Catch Up FridayQuarter 4 Peace-education.pptx Catch Up Friday
Quarter 4 Peace-education.pptx Catch Up Friday
ย 
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptx
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptxMULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptx
MULTIDISCIPLINRY NATURE OF THE ENVIRONMENTAL STUDIES.pptx
ย 
ACC 2024 Chronicles. Cardiology. Exam.pdf
ACC 2024 Chronicles. Cardiology. Exam.pdfACC 2024 Chronicles. Cardiology. Exam.pdf
ACC 2024 Chronicles. Cardiology. Exam.pdf
ย 
Rapple "Scholarly Communications and the Sustainable Development Goals"
Rapple "Scholarly Communications and the Sustainable Development Goals"Rapple "Scholarly Communications and the Sustainable Development Goals"
Rapple "Scholarly Communications and the Sustainable Development Goals"
ย 
How to Configure Email Server in Odoo 17
How to Configure Email Server in Odoo 17How to Configure Email Server in Odoo 17
How to Configure Email Server in Odoo 17
ย 
Procuring digital preservation CAN be quick and painless with our new dynamic...
Procuring digital preservation CAN be quick and painless with our new dynamic...Procuring digital preservation CAN be quick and painless with our new dynamic...
Procuring digital preservation CAN be quick and painless with our new dynamic...
ย 
How to do quick user assign in kanban in Odoo 17 ERP
How to do quick user assign in kanban in Odoo 17 ERPHow to do quick user assign in kanban in Odoo 17 ERP
How to do quick user assign in kanban in Odoo 17 ERP
ย 
Employee wellbeing at the workplace.pptx
Employee wellbeing at the workplace.pptxEmployee wellbeing at the workplace.pptx
Employee wellbeing at the workplace.pptx
ย 

Predicting Future Sale

  • 1. In [1]: # Computational & Basic Python Libraries import re import numpy as np import pandas as pd from collections import Counter # Visualization Libraries import seaborn as sns import matplotlib.pyplot as plt sns.set_style('whitegrid') %matplotlib inline # Machine Learning Libraries from sklearn.cluster import KMeans import statsmodels.formula.api as sm import scipy.cluster.hierarchy as sch from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestClassifier from sklearn.cluster import AgglomerativeClustering linReg = LinearRegression() rndFor = RandomForestClassifier() # Python NLTK Libraries from nltk.corpus import stopwords from nltk.stem import PorterStemmer from nltk.tokenize import word_tokenize from sklearn.feature_extraction.text import CountVectorizer PS = PorterStemmer() stopWords = set(stopwords.words("english")) stopWords_rus = set(stopwords.words("russian")) Importing Data All the texts are in Russian and I've no idea what the names are, so I converted everything to English using Translator() from Google Translator Library (https://pypi.org/project/googletrans/). Below is the Code for that, if anyone is interested! Notes The English Translated Texts were saved to Drive with the Following Commands. There was some encoding problems with the file, as few characters were not properly translated to English. # Google Transaltor Library from googletrans import Translator translator = Translator() # Shops Data shopsData['shopName'] = shopsData['shop_name'].apply(lambda x: translator.translate(x).text) shopsData[['shopName', 'shop_id']].to_csv('./data/shopsData_englishTranslated.csv', index=False, encoding='utf-8') # Item Category Data itemCategoryData['itemCategoryName'] = itemCategoryData['item_category_name'].apply(lambda x: translator.translate(x).text) itemCategoryData[['itemCategoryName', 'item_category_id']].to_csv('./data/itemCategory_englishTranslated.csv', index=False, encoding='utf-8') In case of Items Data it is advisable not to use the Translator as: >> There are 22,171 Item Name and Converting them will take a huge amount of Time! >> Translator() is not fully Stable and "ValueError Exception : No JSON Object could be decoded" was Generated. In [2]: itemsData = pd.read_csv('./data/items.csv', encoding='utf-8') shopsData = pd.read_csv('./data/shopsData_englishTranslated.csv') itemCategoryData = pd.read_csv('./data/itemCategory_englishTranslated.csv') In [2]: trainData = pd.read_csv('./data/sales_train_v2.csv') testData = pd.read_csv('./data/test.csv')
  • 2. In [28]: print trainData.info(), 'n' print testData.info(), 'n' print 'Column Names: ', [x for x in trainData.columns if x not in testData.columns] Categorization of Data Intution Behind: Generally people tend to buy stuffs from their Favourite Brand and also from their Favourite Store - depending on the store's performance and reputation. From the given dataset, we have no idea about these information, but lets try to categorize the Shops and the Items on some other Features - using only their Names. From one of the Link (https://www.shopify.in/blog/4660242-what-should-i-name-my-online-store) I've found on the internet, here are the few things that a Succesful Business have: 1. Short & Simple Name 2. Different from the Rest 3. Web Presence of the Business, etc. Since, most of these points are not possible in case of our analysis - let's just focus on Name. Shop Names: Using K-Means Clustering on the Shop Name - with two Independent Variables: Word Count and Character Count in Shop Names, after omiting special characters. Item Category Name: For classification of Item Category Name, I will use some Text Classification like: >> Finding the list of Top 25 Most Used Words in the Category Name, then >> Making a list of those words and passing it as a Parameter to CountVectorizer() under Text Feature Extraction to find the Name Strength of Item Category Name. Item Name: Only Focusing on the Name of the Item, passing the Item Name Length as a Parameter for Machine Learning Algorithm, after removing the Stop Words from the Name. Finally, for each Categorizations - we use: data[['Category_1', 'Category_2']] = pd.get_dummies(data, drop_first=True, columns=['Category_1', 'Category_2']) In [6]: def textLengthCount(text): # This Function takes in the Text - Clears the text of Special Characters and Returns Word Count and Character Count cleanText = re.sub('[^A-Za-z0-9 ]+', '', text) wordCount = len(cleanText.split()) return(wordCount, len(cleanText) - wordCount -1) <class 'pandas.core.frame.DataFrame'> RangeIndex: 2935849 entries, 0 to 2935848 Data columns (total 6 columns): date object date_block_num int64 shop_id int64 item_id int64 item_price float64 item_cnt_day float64 dtypes: float64(2), int64(3), object(1) memory usage: 134.4+ MB None <class 'pandas.core.frame.DataFrame'> RangeIndex: 214200 entries, 0 to 214199 Data columns (total 3 columns): ID 214200 non-null int64 shop_id 214200 non-null int64 item_id 214200 non-null int64 dtypes: int64(3) memory usage: 4.9 MB None Column Names: ['date', 'date_block_num', 'item_price', 'item_cnt_day']
  • 3. In [7]: %%time shopsData['shopName_wordCount'] = shopsData['shopName'].apply(lambda x: textLengthCount(x)[0]) shopsData['shopName_lenCount'] = shopsData['shopName'].apply(lambda x: textLengthCount(x)[1]) X = shopsData.iloc[:, [2, 3]].values # Creating an 2D-Array for K-Means WCSS = [] # for Shops Name : from the Chart we can Clearly figure out that the Optimal No. of Category(K) = 3 for i in range(1, 7): kMeans_Cluster = KMeans(n_clusters= i, init='k-means++') kMeans_Cluster.fit(X) WCSS.append(kMeans_Cluster.inertia_) # Visualizing the WCSS Value plt.figure(figsize=(15, 5)) plt.plot(range(1, 7), WCSS, 'o-') plt.xticks(range(1, 7)) plt.title('Within Center Sum of Squared Value') plt.xlabel('No. of Clusters') plt.ylabel('WCSS') In [8]: %%time kMeans_Cluster = KMeans(n_clusters=3) y_kMeans = kMeans_Cluster.fit_predict(X) In [9]: # Visualizing the Shop Category plt.figure(figsize=(15, 8)) plt.scatter(X[y_kMeans == 0, 0], X[y_kMeans == 0, 1], c='red', label='Small Name Length') plt.scatter(X[y_kMeans == 1, 0], X[y_kMeans == 1, 1], c='cyan', label='Average Name Length') plt.scatter(X[y_kMeans == 2, 0], X[y_kMeans == 2, 1], c='blue', label='Large Name Length') plt.scatter(kMeans_Cluster.cluster_centers_[:, 0], kMeans_Cluster.cluster_centers_[:, 1], s=100, c='black', label='Centroids') plt.title('Shop Classification using K-Means') plt.xlabel('Word Count in Shop Name') plt.ylabel('Character Count in Shop Name') plt.legend() Wall time: 898 ms Wall time: 32 ms Out[9]: <matplotlib.legend.Legend at 0x10955d30>
  • 4. In [71]: %%time # Creating Shops Category from the Predicted Result, and appending it to the Data Set shopsData['shopCategory'] = y_kMeans shopsData[['shop_Cat0', 'shop_Cat1', 'shop_Cat2']] = pd.get_dummies(shopsData['shopCategory']) In [13]: %%time itemCategoryData['itemCategoryName_Cleaned'] = itemCategoryData['itemCategoryName'].apply( lambda x: str(re.sub('[^A-Za-z0-9]', ' ', x)).replace(' ', ' ')).apply( lambda x: PS.stem(x)).apply( lambda x: ' '.join([i for i in word_tokenize(x)if i not in stopWords])) In [14]: %%time allWords = [] for i in range(len(itemCategoryData)): tempWords = word_tokenize(itemCategoryData['itemCategoryName_Cleaned'][i]) for j in tempWords: if(j == 'game'): # Stemmer was not able to Distinguish 'games' and 'game' as a Single Word j = 'games' allWords.append(j) mostCommonWords = [] for i in Counter(allWords).most_common(25): mostCommonWords.append(i[0]) print 'List of Most Common Word Item Category Name:n', Counter(allWords).most_common(25), 'n' In [15]: %%time # Passing the Most Common Words as Vocabulary and Building the Name Strength Meter countVectorizer = CountVectorizer(vocabulary=mostCommonWords) itemCategoryData['itemCategory_nameStrength'] = np.array(countVectorizer .fit_transform(itemCategoryData['itemCategoryName_Cleaned']) .toarray()).mean(axis=1) In [16]: %%time itemsData['nameLen'] = itemsData['item_name'].apply( lambda x: ' '.join([i for i in word_tokenize(x)if i not in stopWords_rus])).apply(len) # Saving Output itemsData.drop('item_name', axis=1).to_csv('./output/itemsData_categorised_v1.csv', index=False) shopsData.drop(['shopName', 'shopCategory'], axis=1).to_csv('./output/shopsData_categorised_v1.csv', index=False) itemCategoryData.drop(['itemCategoryName', 'itemCategoryName_Cleaned'], axis=1).to_csv('./output/itemCategoryDate_categorised_v1.csv', index=False) Validating the Categories with Item Count After Categorization of the Data - checking if these Categories can correctly predict Item Sale for a Month Daily - based on the mean of Item Sale per Item, per Shop. For this, I'm using P-Value to determine how well these categories define the sale of an Item. In [38]: %%time avgItemSale_perShop = trainData[[u'shop_id', u'item_id', u'item_cnt_day']].groupby( [u'shop_id', u'item_id']).mean().reset_index() In [74]: %%time # X_dataTemp : temporary variable for Building OLS Summary X_dataTemp = avgItemSale_perShop.merge( itemsData.drop('item_name', axis=1), on='item_id', how='left').merge( shopsData.drop(['shopName', 'shopCategory'], axis=1), on='shop_id', how='left').merge( itemCategoryData[['item_category_id', 'itemCategory_nameStrength']], on='item_category_id', how='left') X_dataTemp['constant_B0'] = 1 # Since in statsmodels.formula.api() the Intercept is not Included by Default In [91]: X_optimumFeatures = X_dataTemp.iloc[:, [11, 4, 5, 10]] y = X_dataTemp.iloc[:, 2] In [92]: %%time # OLS : Oridinary Least Square regressor_OLS = sm.OLS(endog=y, exog=X_optimumFeatures).fit() Wall time: 2 ms Wall time: 207 ms List of Most Common Word Item Category Name: [('games', 24), ('books', 12), (u'gifts', 12), ('accessories', 8), ('consoles', 8), ('music', 7), ('xbox', 6), ('programs', 6), (u'p c', 5), ('cards', 5), ('cinema', 5), ('payment', 5), (u'edit', 4), ('ps2', 3), ('ps3', 3), ('ps4', 3), ('digit', 3), ('psp', 3), ('p svita', 3), ('blu', 3), ('360', 3), (u'literatur', 3), ('ray', 3), (u'offic', 2), (u'board', 2)] Wall time: 21 ms Wall time: 4 ms Wall time: 4.73 s Wall time: 655 ms Wall time: 341 ms Wall time: 102 ms
  • 5. In [93]: %%time regressor_OLS.summary() Conclusion: Generally for Backward Elimination Process, optimal value of is is to be under: From the OLS Summary, shopName_lenCount : , thus not considering the Feature. But, while considering Shop Category (by using drop_first=True) to minimise Dummy Variable Trap, , from which it can be concluded, that Categorization of the Shop based on Name is not a Good Idea! Thus, considering the following Features, which seems to be a Good Item Count Predictor for a Month! In [95]: X_optimumFeatures.drop(u'constant_B0', axis=1).columns Classification of Item Price An Item's Price should be a very important deterministic factor while considering sale of the Item on a Daily Basis. Exploring Training & Testing Data based on Item Price, Shop ID and Item ID. In [106]: %%time print 'Train Datan==========' print 'No. of Unique Shops:', trainData['shop_id'].nunique() print 'No. of Unique Items:', trainData['item_id'].nunique() print 'nTest Datan=========' print 'No. of Unique Shops:', testData['shop_id'].nunique() print 'No. of Unique Items:', testData['item_id'].nunique() print 'n' print 'Train-Test Data Mis-Match Count:', len([x for x in trainData['item_id'].unique() if x not in testData['item_id'].unique()]) print 'Test-Train Data Mis-Match Count:', len([x for x in testData['item_id'].unique() if x not in trainData['item_id'].unique()]) print 'n' P > |t| SignificanceLevel(SL) <= 5% P > |t| = 0.581, SL > 5% SL > 5% Wall time: 121 ms Out[93]: OLS Regression Results Dep. Variable: item_cnt_day R-squared: 0.001 Model: OLS Adj. R-squared: 0.001 Method: Least Squares F-statistic: 174.6 Date: Wed, 23 May 2018 Prob (F-statistic): 4.16e-113 Time: 15:03:01 Log-Likelihood: -6.5911e+05 No. Observations: 424124 AIC: 1.318e+06 Df Residuals: 424120 BIC: 1.318e+06 Df Model: 3 Covariance Type: nonrobust coef std err t P>|t| [0.025 0.975] constant_B0 1.0604 0.007 142.870 0.000 1.046 1.075 nameLen 0.0010 8.58e-05 12.089 0.000 0.001 0.001 shopName_wordCount -0.0212 0.002 -13.330 0.000 -0.024 -0.018 itemCategory_nameStrength 0.6110 0.052 11.749 0.000 0.509 0.713 Omnibus: 2612880.482 Durbin-Watson: 1.895 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1286148985691768.750 Skew: 468.684 Prob(JB): 0.00 Kurtosis: 269778.319 Cond. No. 1.33e+03 Out[95]: Index([u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength'], dtype='object') Train Data ========== No. of Unique Shops: 60 No. of Unique Items: 21807 Test Data ========= No. of Unique Shops: 42 No. of Unique Items: 5100 Train-Test Data Mis-Match Count: 17070 Test-Train Data Mis-Match Count: 363 Wall time: 4min 33s
  • 6. Conclusion: 1. As we can see there is a Mis-match in the number of Items among Training & Testing Data. 2. Most of the Items are Present in Training Data, 3. However, there are 363 unique and new Items in test data - which is not Present in Training Set. 4. Also, there are no new Shops in Testing Data. One Important Criterion which seems as an Important Deciding Factor is the Classification of Item ID with Shop ID. To Classify the Item Price, let's do the Following things: 1. Classifying the Shops, based on the Number of Items Availible 2. Classifying the Items, based on their Availibility 3. Finding the mean(), median() and std() price of the Item, on the basis of: a. per Item, per Shop, per Month b. Combining the above parameters, and finding new Categories Classifying the Shops, based on the Number of Items Availible Intution Behind: Shops with more number of items might attract more amount of Customers - hence more amount of sale! In [12]: %%time totalCount_uniqueItems_perShop_trainData = trainData[['shop_id', 'item_id']].groupby('shop_id').item_id.nunique().reset_index().rename( columns={ 'item_id' : 'Count_uniqueItems' }) totalCount_uniqueItems_perShop_testData = testData[['shop_id', 'item_id']].groupby('shop_id').item_id.nunique().reset_index().rename( columns={ 'item_id' : 'Count_uniqueItems' }) In [13]: %%time plt.figure(figsize=(15, 5)) plt.scatter(totalCount_uniqueItems_perShop_trainData['shop_id'], totalCount_uniqueItems_perShop_trainData['Count_uniqueItems']) plt.title('Scatter Plot of Shop ID and No. of Unique Items') plt.xlabel('Shop ID') plt.ylabel('Total No. of Unique Items per Shop') In [14]: X = totalCount_uniqueItems_perShop_trainData.iloc[:, :].values Wall time: 787 ms Wall time: 54 ms
In [15]: %%time
plt.figure(figsize=(25, 10))
plt.rcParams.update({'font.size': 15})
dendoGram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.axhline(y=15000, color='black', linestyle='-')
plt.axhline(y=12500, color='black', linestyle='--')
plt.axhline(y=17500, color='black', linestyle='--')
plt.title('Dendrograms')
plt.xlabel('Clusters/Points')
plt.ylabel('Euclidean Distance')
plt.yticks([5000, 10000, 12500, 15000, 17500, 20000, 25000, 30000])

Wall time: 184 ms

From the dendrogram, cutting at Threshold(Θ) = 15000 units, the optimal number of clusters is 3.

Based on this optimal number of categories, the shops are categorised by the number of items available - for both the train and the test data.

In [16]: X_testData = totalCount_uniqueItems_perShop_testData.iloc[:, :].values

In [17]: %%time
agg_HC = AgglomerativeClustering(n_clusters=3)    # Affinity & Linkage are left at their defaults
X_HC_trainData = agg_HC.fit_predict(X)
X_HC_testData = agg_HC.fit_predict(X_testData)

Wall time: 1e+03 µs

In [18]: plt.figure(figsize=(15, 5))
plt.rcParams.update({'font.size': 10})
plt.scatter(X[X_HC_trainData == 0, 0], X[X_HC_trainData == 0, 1], c = 'red', label = 'Low Items')
plt.scatter(X[X_HC_trainData == 1, 0], X[X_HC_trainData == 1, 1], c = 'blue', label = 'High Items')
plt.scatter(X[X_HC_trainData == 2, 0], X[X_HC_trainData == 2, 1], c = 'green', label = 'Average Items')
plt.title('Cluster of Shop ID and No. of Unique Items')
plt.xlabel('Shop ID')
plt.ylabel('Total No. of Unique Items per Shop')
plt.legend(loc=4)

Out[18]: <matplotlib.legend.Legend at 0xecef5f8>

In [19]: totalCount_uniqueItems_perShop_trainData['shop_itemAvailibility'] = X_HC_trainData
totalCount_uniqueItems_perShop_testData['shop_itemAvailibility'] = X_HC_testData
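One caveat worth flagging: the two fit_predict calls above assign cluster ids independently, so label 0 on the training shops is not guaranteed to mean the same "Low Items" group on the test shops. A minimal sketch of one way to make the labels comparable - a hypothetical helper, not in the original notebook, which would have to run before the labels are written back in In [19] - is to reorder them by each cluster's mean item count:

# remap arbitrary cluster ids so that 0 = lowest, 1 = middle, 2 = highest mean item count
def relabel_by_mean(counts, labels):
    order = np.argsort([counts[labels == c].mean() for c in range(3)])
    mapping = dict((old, new) for new, old in enumerate(order))
    return np.array([mapping[l] for l in labels])

X_HC_trainData = relabel_by_mean(X[:, 1], X_HC_trainData)
X_HC_testData = relabel_by_mean(X_testData[:, 1], X_HC_testData)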
In [29]: totalCount_uniqueItems_perShop_trainData[['lowItemCount', 'highItemCount']] = pd.get_dummies(
    totalCount_uniqueItems_perShop_trainData['shop_itemAvailibility'])[[0, 1]]
totalCount_uniqueItems_perShop_testData[['lowItemCount', 'highItemCount']] = pd.get_dummies(
    totalCount_uniqueItems_perShop_testData['shop_itemAvailibility'])[[0, 1]]

# Saving Output
totalCount_uniqueItems_perShop_trainData.to_csv('./output/totalCount_uniqueItems_perShop_trainData.csv', index=False)
totalCount_uniqueItems_perShop_testData.to_csv('./output/totalCount_uniqueItems_perShop_testData.csv', index=False)

Classifying the Items, Based on their Availability

Intuition behind it: Items which are widely available should sell in a distributed way across shops; but if an item is available in only a small number of shops, its sales from those shops should be high, or vice-versa - and the Machine Learning model should be able to pick up that parameter!

In [22]: %%time
itemAvailibility_train = trainData[['shop_id', 'item_id']].groupby('item_id').shop_id.nunique().reset_index()
itemAvailibility_train = itemAvailibility_train.rename(columns={ 'shop_id' : 'shopCount' })
itemAvailibility_test = testData[['shop_id', 'item_id']].groupby('item_id').shop_id.nunique().reset_index()
itemAvailibility_test = itemAvailibility_test.rename(columns={ 'shop_id' : 'shopCount' })

Wall time: 711 ms

In [25]: %%time
plt.figure(figsize=(15, 6))
plt.rcParams.update({'font.size': 12.5})
plt.plot(itemAvailibility_train.groupby('shopCount').item_id.nunique().reset_index()['item_id'].apply(lambda x: x/2000.0),
         label='Distribution (2e^-3)')
plt.boxplot(itemAvailibility_train['shopCount'], showmeans=True, vert=False)
plt.axvline(itemAvailibility_train['shopCount'].median(), color='black', linestyle='--', alpha=0.5)
plt.axvline(itemAvailibility_train['shopCount'].std(), color='cyan', linestyle='--', alpha=0.5, label='Standard Deviation')
plt.axvline(3.0, color='black', linestyle='--', alpha=0.5)
plt.axvline(34.0, color='black', linestyle='--', alpha=0.5)
plt.axvline(42.0, color='magenta', linestyle='--', alpha=0.5, label='Test Data Shop Count')
plt.title('Distribution of Shop Count')
plt.xlabel('Shop Count per Item')
plt.ylabel('No. of Items (scaled) per Shop Count')
plt.xticks([0, 5, 10, itemAvailibility_train['shopCount'].median(), itemAvailibility_train['shopCount'].mean(), 25, 30, 35, 40, 45, 50, 55, 60])
plt.yticks([0, 2])
plt.legend()

Wall time: 91 ms
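The thresholds marked in the plot (around 3 shops, the median, 34 shops and the test data's 42 shops) line up with the summary statistics of shopCount. An optional one-liner to print them, not in the original notebook:

# distribution of shops-per-item, which motivates the category boundaries on the next slide
print itemAvailibility_train['shopCount'].describe()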
Based on the above graph, we can conclude the following:
1. Most of the items are available in only a small number of shops in the training data.
2. In the test data, every item is available in all 42 unique shops.

Based on this, let's classify the items as follows:

Category Number      Definition                          Shop Count Range
Category 1 ('cat1')  Low availability of items           [0, 5)
Category 2 ('cat2')  Moderately available, below median  [5, 15]
Category 3 ('cat3')  Moderately available, above median  (15, 35)
Category 4 ('cat4')  High availability of items          [35, 60)

Note: All the items in the test data fall into Category 4. From the above categories, let's consider Category 1, 3 & 4 for feature engineering.

In [26]: %%time
def itemAvailble_caegory(shopCount):
    if(shopCount < 5):
        return('cat1')
    elif(shopCount <= 15):
        return('cat2')
    elif(shopCount < 35):
        return('cat3')
    else:
        return('cat4')

itemAvailibility_train['itemAvailble_category'] = itemAvailibility_train['shopCount'].apply(lambda x: itemAvailble_caegory(x))
# For the test data all the values fall into Category 4, so the dummy columns are appended directly below.

Wall time: 12 ms

In [27]: itemAvailibility_train[['itemAC_cat1', 'itemAC_cat3', 'itemAC_cat4']] = pd.get_dummies(
    itemAvailibility_train['itemAvailble_category'])[['cat1', 'cat3', 'cat4']]
itemAvailibility_test['itemAC_cat1'] = 0
itemAvailibility_test['itemAC_cat3'] = 0
itemAvailibility_test['itemAC_cat4'] = 1

# Saving Output
itemAvailibility_train.to_csv('./output/itemAvailibility_train.csv', index=False)
itemAvailibility_test.to_csv('./output/itemAvailibility_test.csv', index=False)

Building a Stats-Model for the OLS Summary

Now, let's build a matrix of features by merging the two datasets - trainData with itemAvailibility_train and testData with itemAvailibility_test - and fit a stats-model to find whether (if at all) these features are accurate predictors of Item Price.

In [35]: %%time
X_data = trainData[['shop_id', 'item_id', 'item_price']].merge(
    itemAvailibility_train.drop('itemAvailble_category', axis=1), on='item_id', how='left').merge(
    totalCount_uniqueItems_perShop_trainData.drop('shop_itemAvailibility', axis=1), on='shop_id', how='left')
X_data['constant_B0'] = 1    # statsmodels' OLS does not include an intercept by default

Wall time: 1.29 s

In [36]: X_optimumFeatures = X_data.iloc[:, [10, 3, 4, 5, 6, 7, 8, 9]]
y = X_data.iloc[:, 2]

In [37]: %%time
# OLS : Ordinary Least Squares
regressor_OLS = sm.OLS(endog=y, exog=X_optimumFeatures).fit()

Wall time: 1.7 s
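For reference, the Backward Elimination procedure used with these OLS summaries can be sketched as a small loop: refit, drop the feature with the worst p-value, and stop once every remaining p-value is at or below the significance level. This is a hypothetical helper written for illustration (olsModel and keptFeatures are made-up names), not code from the original notebook:

# generic backward elimination over an exogenous feature matrix X (with a 'constant_B0' column)
def backward_elimination(X, y, sl=0.05):
    features = list(X.columns)
    while True:
        model = sm.OLS(endog=y, exog=X[features]).fit()
        pvals = model.pvalues.drop('constant_B0')   # never drop the intercept itself
        worst = pvals.idxmax()
        if pvals[worst] <= sl:
            return model, features
        features.remove(worst)

olsModel, keptFeatures = backward_elimination(X_optimumFeatures, y)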
In [38]: %%time
regressor_OLS.summary()

Wall time: 1.5 s

Out[38]: OLS Regression Results
Dep. Variable: item_price     R-squared: 0.009
Model: OLS                    Adj. R-squared: 0.009
Method: Least Squares         F-statistic: 3812.
Date: Thu, 24 May 2018        Prob (F-statistic): 0.00
Time: 12:30:36                Log-Likelihood: -2.6041e+07
No. Observations: 2935849     AIC: 5.208e+07
Df Residuals: 2935841         BIC: 5.208e+07
Df Model: 7
Covariance Type: nonrobust

                        coef    std err         t     P>|t|     [0.025     0.975]
constant_B0         878.6534      9.454    92.941     0.000    860.124    897.183
shopCount            14.0939      0.202    69.846     0.000     13.698     14.489
itemAC_cat1          21.0296     10.311     2.039     0.041      0.819     41.240
itemAC_cat3        -432.4488      7.061   -61.249     0.000   -446.287   -418.610
itemAC_cat4        -349.8570      9.408   -37.188     0.000   -368.296   -331.418
Count_uniqueItems    -0.0263      0.001   -32.429     0.000     -0.028     -0.025
lowItemCount       -115.1687      5.146   -22.381     0.000   -125.254   -105.083
highItemCount        52.4302      4.360    12.026     0.000     43.886     60.975

Omnibus: 4945493.941    Durbin-Watson: 0.757
Prob(Omnibus): 0.000    Jarque-Bera (JB): 25209634895.407
Skew: 10.829            Prob(JB): 0.00
Kurtosis: 456.448       Cond. No. 1.41e+05

Conclusion: A feature is retained when P>|t| <= SignificanceLevel (SL) = 5%. Every feature in the matrix satisfies this, so all of them are kept as Item Price predictors (note, though, that with an R-squared of only 0.009 the linear fit explains just a small part of the price variation). Thus, the following features are used to predict Item Price.

In [40]: X_optimumFeatures.drop(u'constant_B0', axis=1).columns

Out[40]: Index([u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4', u'Count_uniqueItems', u'lowItemCount', u'highItemCount'], dtype='object')

Based on these features, a Multiple Linear Regression model trained on the training data is used to predict the item price for the test data.

In [66]: # note: the name y is reused here for the merged test-data feature frame
y = testData[['shop_id', 'item_id']].merge(
    itemAvailibility_test, on='item_id', how='left').merge(
    totalCount_uniqueItems_perShop_testData.drop('shop_itemAvailibility', axis=1), on='shop_id', how='left')

In [70]: %%time
linReg.fit(X_data[[u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4', u'Count_uniqueItems', u'lowItemCount', u'highItemCount']],
           X_data['item_price'])

Wall time: 1.24 s

Out[70]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [71]: predicted_itemPrice = linReg.predict(y[[u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4', u'Count_uniqueItems', u'lowItemCount', u'highItemCount']])

In [73]: testData['item_price'] = predicted_itemPrice

In [76]: testData['date_block_num'] = 34

In [77]: print 'Column Names: ', [x for x in trainData.columns if x not in testData.columns]

# Saving Output File
testData.to_csv('./output/new_testData.csv', index=False)

Column Names:  ['date', 'item_cnt_day']
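Since the price model's R-squared is low, some predicted test prices may come out implausible, even negative. An optional sanity check and floor - not part of the original notebook, and featureCols_price is a made-up name:

# score the price model on its own training data and floor negative predictions at zero
featureCols_price = [u'shopCount', u'itemAC_cat1', u'itemAC_cat3', u'itemAC_cat4',
                     u'Count_uniqueItems', u'lowItemCount', u'highItemCount']
print 'Training R^2:', linReg.score(X_data[featureCols_price], X_data['item_price'])
print 'Negative predicted prices:', (testData['item_price'] < 0).sum()
testData['item_price'] = testData['item_price'].clip(lower=0)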
Finding the Item Price Features

In [78]: def priceChanged(listPrice):
    # Returns 1 if there was a change in the item's price, else 0
    if(len(listPrice) > 1):
        return 1
    else:
        return 0

Item Price - per Item

In [81]: %%time
# Train Data
itemPrice_perItem_trainData = trainData[[u'item_id', u'item_price']].groupby(u'item_id').agg(
    {'item_price' : lambda x: list(set(x))}).reset_index()
itemPrice_perItem_trainData.rename(columns=({'item_price' : 'listPrice'}), inplace=True)
itemPrice_perItem_trainData['averagePrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.mean(x))
itemPrice_perItem_trainData['medianPrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.median(x))
itemPrice_perItem_trainData['stdPrice'] = itemPrice_perItem_trainData['listPrice'].apply(lambda x: np.std(x))
itemPrice_perItem_trainData['priceChanged_for_perItem'] = itemPrice_perItem_trainData['listPrice'].apply(
    lambda x: priceChanged(x))

# Test Data
itemPrice_perItem_testData = testData[[u'item_id', u'item_price']].groupby(u'item_id').agg(
    {'item_price' : lambda x: list(set(x))}).reset_index()
itemPrice_perItem_testData.rename(columns=({'item_price' : 'listPrice'}), inplace=True)
itemPrice_perItem_testData['averagePrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.mean(x))
itemPrice_perItem_testData['medianPrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.median(x))
itemPrice_perItem_testData['stdPrice'] = itemPrice_perItem_testData['listPrice'].apply(lambda x: np.std(x))
itemPrice_perItem_testData['priceChanged_for_perItem'] = itemPrice_perItem_testData['listPrice'].apply(
    lambda x: priceChanged(x))

# Saving Output File
itemPrice_perItem_trainData.to_csv('./output/itemPrice_perItem_trainData.csv', index=False)
itemPrice_perItem_testData.to_csv('./output/itemPrice_perItem_testData.csv', index=False)

Wall time: 4.54 s

Item Sale Plot

In [3]: %%time
itemSold = trainData[[u'date_block_num', u'item_cnt_day']].groupby(u'date_block_num').sum().reset_index()
itemSold.rename(columns={'date_block_num' : 'monthNumber', 'item_cnt_day' : 'itemSold_month'}, inplace=True)

Wall time: 253 ms
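Before moving on to the monthly sale trend, note that the price features above are computed per item only, while the plan earlier also mentioned per-Item-per-Shop statistics. A minimal sketch of that variant, with hypothetical column names and not part of the original notebook:

# per-Item, per-Shop price statistics on the training data
itemPrice_perItemShop_trainData = trainData.groupby(['shop_id', 'item_id'])['item_price'].agg(
    ['mean', 'median', 'std']).reset_index().rename(columns={
        'mean' : 'averagePrice_shop', 'median' : 'medianPrice_shop', 'std' : 'stdPrice_shop'})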
In [28]: %%time
plt.figure(figsize=(20, 7))
plt.rcParams.update({'font.size': 15})
plt.plot(itemSold['itemSold_month'], label='Item Sold')
plt.axhline(itemSold['itemSold_month'].mean(), color='black', linestyle='--', alpha=0.5, label='Mean Sale')
plt.title('Trend in Total Sale')
plt.xlabel('Month')
plt.ylabel('Sale')
plt.xticks(range(0, 35, 5))
plt.yticks(range(50000, 225000, 25000))
plt.legend()

Wall time: 53 ms

In [31]: %%time
plt.figure(figsize=(20, 5))
plt.rcParams.update({'font.size': 12.5})
plt.plot(itemSold['itemSold_month'].rolling(window=12, center=False).mean(), label='Rolling Mean')
plt.plot(itemSold['itemSold_month'].rolling(window=12, center=False).std(), label='Rolling STD')
plt.legend()

Wall time: 49 ms

Conclusion: Monthly sales show a gradually decaying trend, so the model is fitted using:
>> Linear Regression, and
>> Random Forest

Fitting the Linear Regression Model

Preparing Data

In [93]: itemsData = pd.read_csv('./output/itemsData_categorised_v1.csv')
shopsData = pd.read_csv('./output/shopsData_categorised_v1.csv')
itemCategoryData = pd.read_csv('./output/itemCategoryDate_categorised_v1.csv')

In [114]: %%time
final_trainData = trainData.drop('date', axis=1).merge(
    itemsData, on='item_id', how='left').merge(
    shopsData[['shop_id', u'shopName_wordCount']], on='shop_id', how='left').merge(
    itemCategoryData, on='item_category_id', how='left').merge(
    itemPrice_perItem_trainData.drop('listPrice', axis=1), on='item_id')
final_testData = testData.merge(
    itemsData, on='item_id', how='left').merge(
    shopsData[['shop_id', u'shopName_wordCount']], on='shop_id', how='left').merge(
    itemCategoryData, on='item_category_id', how='left').merge(
    itemPrice_perItem_testData.drop('listPrice', axis=1), on='item_id')

Wall time: 3.2 s
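The left merges above can silently introduce NaNs whenever an id is missing from one of the lookup tables, which would break the model fits below. An optional check, not in the original notebook:

# any non-zero total here means a lookup id was missing in one of the merged tables
print 'NaNs in final_trainData:', final_trainData.isnull().sum().sum()
print 'NaNs in final_testData:', final_testData.isnull().sum().sum()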
# Saving Output to a File
final_trainData.to_csv('./output/final_trainData.csv', index=False)
final_testData.to_csv('./output/final_testData.csv', index=False)

In [117]: print 'Column Names: ', [x for x in final_trainData.columns if x not in final_testData.columns]

Column Names:  ['item_cnt_day']

In [118]: final_trainData.columns

Out[118]: Index([u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_cnt_day', u'item_category_id', u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength', u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem'], dtype='object')

Linear Regression Model

In [119]: linReg.fit(final_trainData[[u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_category_id',
                            u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength',
                            u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem']],
           final_trainData[u'item_cnt_day'])

Out[119]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [120]: predicted_itemCount = linReg.predict(final_testData[[u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_category_id',
                                              u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength',
                                              u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem']])

In [122]: %%time
resultantDict = { 'ID' : testData['ID'], 'item_cnt_month' : predicted_itemCount }
linReg_output = pd.DataFrame(resultantDict)
linReg_output.to_csv('./output/finalOutput_linReg.csv', index=False)

Wall time: 327 ms

Random Forest Classifier

In [3]: %%time
rndFor.fit(final_trainData[[u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_category_id',
                            u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength',
                            u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem']],
           final_trainData[u'item_cnt_day'])

Wall time: 1min 35s

Out[3]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=False)

In [4]: predicted_itemCount_rndForest = rndFor.predict(final_testData[[u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_category_id',
                                                      u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength',
                                                      u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem']])

In [7]: %%time
resultantDict = { 'ID' : testData['ID'], 'item_cnt_month' : predicted_itemCount_rndForest }
rndFor_output = pd.DataFrame(resultantDict)
rndFor_output.to_csv('./output/finalOutput_rndFor.csv', index=False)

Wall time: 703 ms
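A closing remark: RandomForestClassifier treats every distinct value of item_cnt_day as a class label, which is an unusual fit for a count target; a regressor is usually the more natural choice. A minimal alternative sketch with hypothetical names (featureCols, rndForReg), not what the notebook above actually ran:

# same feature set, but with a random-forest regressor instead of a classifier
from sklearn.ensemble import RandomForestRegressor

featureCols = [u'date_block_num', u'shop_id', u'item_id', u'item_price', u'item_category_id',
               u'nameLen', u'shopName_wordCount', u'itemCategory_nameStrength',
               u'averagePrice', u'medianPrice', u'stdPrice', u'priceChanged_for_perItem']

rndForReg = RandomForestRegressor(n_estimators=10, n_jobs=-1)
rndForReg.fit(final_trainData[featureCols], final_trainData[u'item_cnt_day'])
predicted_itemCount_reg = rndForReg.predict(final_testData[featureCols])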