2. Dataset information
2
Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
def load_data(dataframe, path="C:/Users/hamzapc/Desktop/train1.csv"):
    """Read the car listings CSV and publish it as the global ``car_data``.

    Args:
        dataframe: Not used to locate or name anything; kept only so that
            existing calls such as ``load_data('car_data')`` keep working.
        path: Location of the CSV file. Defaults to the path the function
            originally had hard-coded, so callers on other machines can
            pass their own file instead of editing the source.

    Returns:
        The loaded ``pandas.DataFrame`` (the same object that is stored in
        the module-level ``car_data``).
    """
    # The rest of the script reads `car_data` as a module-level name.
    global car_data
    car_data = pd.read_csv(path)
    return car_data
# Load the CSV and bind the result to the module-level `car_data` that the
# code below reads.
load_data('car_data')
Function that returns the dataframe
def check_data(data):
    """Print an overview of *data* to help decide what needs cleaning.

    Reports, in order: the ``DataFrame.info()`` summary, the column index,
    the shape, the number of fully duplicated rows, and then either a
    horizontal bar chart of missing-value counts per column or the message
    "No missing value.".

    Args:
        data: The pandas DataFrame to inspect.

    Returns:
        None. All results are printed or plotted.
    """
    # info() writes its own report and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    data.info()
    print('\n')
    print(f'Columns in the dataframe:\n{data.columns}')
    print('\n')
    print(f'There are {data.shape} rows and column in the dataframe.')
    print('\n')

    # duplicate value
    duplicate = data.duplicated().sum()
    print(f'There are {duplicate} duplicate rows in the dataframe')
    print('\n')

    # missing value
    nan_counts = data.isna().sum()
    # "> 0" so that a single missing cell is reported as well.
    if nan_counts.sum() > 0:
        # Drawn with matplotlib (already imported as plt); 6x5 inches
        # approximates the 600x500 pixel chart requested before.
        _, ax = plt.subplots(figsize=(6, 5))
        nan_counts.plot(kind='barh', ax=ax, color='darkgreen',
                        title='Missing values')
        plt.show()
    else:
        print('No missing value.')
# Print the info summary, duplicate count and missing-value report for the
# loaded dataset.
check_data(car_data)
4
Function that checks for missing values and duplicated rows
5. Columns in the dataframe:
Index(['Price', 'Levy', 'make', 'Model', 'year', 'Category',
'Leather interior', 'Fuel', 'Engine_volume', 'Mileage', 'Cylinders',
'Gear _box _type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
'Airbags'],
dtype='object')
There are (19237, 17) rows and column in the dataframe.
There are 3512 duplicate rows in the dataframe
No missing value.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 17 columns): 5
Output of the function that checks for missing values and duplicated rows
7. Duplicate sample:
Price Levy make Model year Category Leather interior
192 314 1053 MERCEDES-BENZ E 350 2014 Sedan Yes
239 2901 503 HONDA Civic 2012 Sedan Yes
264 392 1017 MERCEDES-BENZ E 300 2017 Sedan Yes
331 282 289 FORD Escape 2008 Jeep Yes
347 39829 1811 LEXUS GX 460 2010 Jeep Yes
Fuel Engine_volume Mileage Cylinders Gear _box _type Drive wheels
192 Diesel 3.5 149486 km 6 Automatic 4x4
239 Hybrid 1.5 146403 km 4 Automatic Front
264 Petrol 2 1600 km 4 Automatic Rear
331 Hybrid 0.4 220474 km 4 Automatic Front
347 Petrol 4.6 155821 km 8 Automatic 4x4
Doors Wheel Color Airbags
192 4-May Left wheel Silver 12
239 4-May Left wheel White 0
264 4-May Left wheel Black 12
331 4-May Left wheel Grey 0
347 4-May Left wheel Black 0
7
Output of the function that removes duplicated values
10. Function that reports the door types of the cars
def door():
    """Print how many distinct door types exist and list them.

    Reads the 'Doors' column of the module-level ``car_data`` DataFrame.
    Returns None; the results are only printed.
    """
    doors = car_data['Doors']
    print("door type number= ", doors.nunique())
    print("door type ", doors.unique())
    # NOTE(review): values such as '4-May' / '2-Mar' in the sample output look
    # like spreadsheet date-mangling of '4-5' / '2-3' - confirm against the CSV.
    print('\n')  # was 'n': the newline escape had lost its backslash


door()
10
door type number= 3
door type ['4-May' '2-Mar' '>5']
output
11. Function that shows the most common color, wheel type and fuel type
def best():
    """Show bar charts of the most common colour, wheel and fuel values.

    Draws three side-by-side bar charts from the module-level ``car_data``:
    value counts of the 'Color', 'Wheel' and 'Fuel' columns. Returns None.
    """
    # (column, title, x-label, y-label) for each panel, left to right.
    # The y-labels now name the quantity actually plotted; two of them were
    # copy-paste leftovers ('company' / 'Car type').
    panels = (
        ('Color', 'Colors Histogram', 'Colors', 'Frequency of color'),
        ('Wheel', 'wheel Type Histogram', 'wheel Type',
         'Frequency of wheel type'),
        ('Fuel', 'fuel type Histogram', 'fuel', 'Frequency of fuel type'),
    )

    plt.figure(figsize=(14, 8))
    for position, (column, title, xlabel, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        axes = car_data[column].value_counts().plot(kind='bar')
        plt.title(title)
        axes.set(xlabel=xlabel, ylabel=ylabel)
    # Keep rotated tick labels and axis labels of neighbouring panels apart.
    plt.tight_layout()
    plt.show()


best()
11
12. Function that searches for the car I want
12
def search(com, model, fuel, color, geer, year):
    """Print the rows of ``car_data`` that match every given attribute.

    Args:
        com: Value to match in the 'make' column.
        model: Value to match in the 'Model' column.
        fuel: Value to match in the 'Fuel' column.
        color: Value to match in the 'Color' column.
        geer: Value to match in the 'Gear _box _type' column.
        year: Value to match in the 'year' column.

    Returns:
        None. The matching rows are printed.
    """
    wanted = (
        ('make', com),
        ('Model', model),
        ('Fuel', fuel),
        ('Color', color),
        ('Gear _box _type', geer),
        ('year', year),
    )
    # Narrow the frame one criterion at a time; the surviving rows are the
    # ones for which all equality tests hold.
    matches = car_data
    for column, value in wanted:
        matches = matches.loc[matches[column] == value]
    print(matches)
#search('TOYOTA','Camry','Petrol','White','Automatic',2020)
Price Levy make Model year Category Leather interior Fuel
39829 1323 TOYOTA Camry 2020 Sedan Yes Petrol
Engine_volume Mileage Cylinders Gear _box _type Drive Doors
2.5 4130 km 4 Automatic Front 4-May
Wheel Color Airbags
Left wheel White 12
14. Function that plots the companies that make the cars
def best_maker():
    """Print and plot the number of rows per car make.

    Uses the 'make' column of the module-level ``car_data``: the value
    counts are printed and drawn as a bar chart. Returns None.
    """
    make_counts = car_data.make.value_counts()

    plt.figure(figsize=(14, 8))
    axes = make_counts.plot(kind='bar')
    print(make_counts)

    plt.title('Car Options Histogram')
    axes.set(xlabel='Car Options', ylabel='Frequency of Car type')
    plt.show()
14
24. This function gives the number of cars per year, per price and per gearbox type
def seabornplot2():
    """Draw three stacked histograms from the module-level ``car_data``.

    Top to bottom: 'year' over all rows, 'Price' over the first 50 rows
    only, and 'Gear _box _type' over all rows. Returns None.
    """
    # (rows to plot, column) for each panel, top to bottom.
    panels = (
        (car_data, "year"),
        (car_data[:50], "Price"),
        (car_data, "Gear _box _type"),
    )
    for position, (frame, column) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        sb.histplot(data=frame, x=column)
    plt.show()


seabornplot2()
24
25. Output of the function that gives the number of cars per year, per price and per gearbox type
25
26. This function shows, for a set of well-known companies, the years in which their cars were made and
the number of cars for each year
def seaborn2():
    """Plot car counts per (year, make) for a fixed list of makes.

    Filters the module-level ``car_data`` to eight makes, counts the makes
    within each year, keeps the first 30 entries of that grouped result and
    draws them as a horizontal bar chart. Returns None.
    """
    selected_makes = ['TOYOTA', 'HYUNDAI', 'HONDA', 'LEXUS',
                      'NISSAN', 'KIA', 'OPEL', 'VOLVO']
    selected = car_data[car_data['make'].isin(selected_makes)]

    per_year = selected.groupby(['year'])['make'].value_counts()
    per_year[:30].plot(kind='barh')
    plt.show()


seaborn2()
26
28. Relation between the price of a car and its company
28
# `com_data` only ever exists as a local inside functions, and `dat` was used
# before it was assigned, so both raised NameError at module level. Build them
# first, using the same make selection as seaborn2().
com_data = car_data[car_data['make'].isin(
    ['TOYOTA', 'HYUNDAI', 'HONDA', 'LEXUS', 'NISSAN', 'KIA', 'OPEL', 'VOLVO'])]
dat = com_data[:10]
print(dat)

# Price by company for the sampled rows.
sb.catplot(x="make", y="Price", data=dat)
plt.show()

# 29. Relation between engine volume and price for 10 rows only.
# col='Price' draws one facet per distinct price among the sampled rows.
sb.displot(data=dat, x='Engine_volume', col='Price')
plt.show()