import csv, random, math # import csv,random,math module
import statistics as st # import statistics module
from statistics import stdev # from statistics library import
# stdev method
1
# Load the data from the CSV file of the
# pima-indians-diabetes dataset.
def loadCsv(filename):
    """Load a purely numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is the list of that
        row's fields converted from string to float.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    # The with-block closes the file even if a conversion fails; newline=""
    # is the mode the csv module asks for when a file is given to csv.reader.
    with open(filename, "r", newline="") as handle:
        return [[float(x) for x in row] for row in csv.reader(handle)]
2
Note:
• dataset contains 768 observations of medical details for
Pima Indian patients.
• The attributes are :
1. Pregnancies
2. Glucose
3. Blood pressure
4. Skin thickness
5. Insulin
6. BMI
7. Diabetic pedigree function
8. Age
Outcome(diabetes) 1 or 0
3
def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into a training set and a test set.

    Args:
        dataset: List of rows. It is copied, not modified.
        splitRatio: Fraction of the rows to move into the test set
            (for example 0.2 moves int(768 * 0.2) = 153 of 768 rows).

    Returns:
        ``[trainSet, testSet]``. Together the two lists hold every row of
        ``dataset`` exactly once.

    Raises:
        ValueError: If ``splitRatio`` asks for more rows than ``dataset`` has.
    """
    testSize = int(len(dataset) * splitRatio)
    if testSize > len(dataset):
        # Fail before sampling instead of letting randrange() raise on an
        # exhausted training list.
        raise ValueError("splitRatio requests more rows than the dataset holds")
    trainSet = list(dataset)  # starts with all examples
    testSet = []  # starts empty
    while len(testSet) < testSize:
        # Randomly pick an instance, remove it from the training data and
        # place it in the test set.
        index = random.randrange(len(trainSet))
        testSet.append(trainSet.pop(index))
    return [trainSet, testSet]
# splitDataset splits the dataset into trainingSet and testSet.
# trainSet initially contains all 768 examples.
# 153 instances are picked at random from trainSet, removed from it and
# placed in testSet, i.e. len(trainSet) + len(testSet) = 768.
4
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their class label.

    The last element of each row (index -1) is taken as the class value.

    Args:
        dataset: Iterable of rows (sequences) whose last element is the label.

    Returns:
        A dictionary mapping each label found to the list of rows carrying
        that label, in their original order.
    """
    separated = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        separated.setdefault(row[-1], []).append(row)
    return separated
# separateByClass groups the training set into 2 classes:
# one group contains all positive instances,
# the other group contains all negative instances.
# separated is a dictionary: key 1 -> all positive-classification
# instances, key 0 -> all negative-classification instances.
# • The function categorizes the dataset in terms of classes (1 and 0).
# • The function assumes that the last attribute (index -1) is the class value.
# • x[-1] = 1 or 0
5
6
separated is a dictionary with key = 1 and 0
When key=1, values are all positive instances
When key=0, values are all negative instances
7
separated is a dictionary with key values 1 and 0
When key=1, values are all positive instances
When key=0, values are all negative instances
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Args:
        numbers: Non-empty sequence of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    return sum(numbers) / float(len(numbers))
# Function to calculate mean
8
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The variance uses the n - 1 divisor, and the standard deviation is its
    square root.

    Args:
        numbers: Sequence of at least two numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two values.
    """
    count = len(numbers)
    avg = sum(numbers) / float(count)
    variance = sum(pow(x - avg, 2) for x in numbers) / float(count - 1)
    return math.sqrt(variance)
# Function to calculate standard deviation
9
# square root of variance is standard deviation
def compute_mean_std(dataset):
    """Return the (mean, standard deviation) of every attribute column.

    The rows are transposed with ``zip(*dataset)`` so that each column can be
    summarised. The last column (the class label) is excluded.

    Args:
        dataset: List of equally long rows whose last element is the label.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per attribute column. An
        empty ``dataset`` yields an empty list.

    Raises:
        statistics.StatisticsError: If ``dataset`` has a single row, because
            the sample standard deviation needs at least two values.
    """
    # Drop the label column before computing anything, so no statistics are
    # calculated for it only to be thrown away.
    attribute_columns = list(zip(*dataset))[:-1]
    return [(st.mean(column), st.stdev(column)) for column in attribute_columns]
# All class 1 rows are zipped column-wise; the mean and standard deviation
# are calculated for each of the 8 attributes.
# All class 0 rows are zipped column-wise; the mean and standard deviation
# are calculated for each of the 8 attributes.
# The function compute_mean_std computes the mean and standard
# deviation of the 8 attributes for both positive and negative instances.
10
Mean, standard deviation of 8 attributes for class 1 and 0
class 1 data mean and standard deviation
class 0 data mean and standard deviation
11
1. Pregnancies
2. Glucose
3. Blood pressure
4. Skin thickness
5. Insulin
6. BMI
7. Diabetic pedigree function
8. Age
def summarizeByClass(dataset):
    """Build the per-class attribute statistics used as the model.

    Args:
        dataset: The training set; the last element of each row is the label.

    Returns:
        A dictionary mapping each class value to the list of
        ``(mean, stdev)`` tuples that ``compute_mean_std`` returns for the
        rows of that class.
    """
    # separateByClass divides the training set by label. Per the original
    # notes, class 1 = having diabetes and class 0 = not having diabetes.
    separated = separateByClass(dataset)
    # One summary per class value (positive and negative instances).
    return {
        classValue: compute_mean_std(instances)
        for classValue, instances in separated.items()
    }
#summary is a dictionary of tuples(mean,std) for each
class value 13
for classValue, instances in separated.items():
#summary is a dictionary of tuples(mean,std) for each class value
summary[classValue] = compute_mean_std(instances)
return summary
classValue =1, instances= all positive class instances
classValue=0, instances=all negative class instances
14
separated is a dictionary with key values 1 and 0
When key=1, values are all positive instances
When key=0, values are all negative instances
summary[classValue] = compute_mean_std(instances)
summary[1]=mean and standard deviation of all positive instances
summary[0]=mean and standard deviation of all negative instances
15
def estimateProbability(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    Computes ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)^2 / (2 * stdev^2))``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; must be non-zero.

    Raises:
        ZeroDivisionError: If ``stdev`` is 0.
    """
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
# Calculate probability density function
16
def calculateClassProbabilities(summaries, testVector):
    """Score one test instance against every class.

    For each class, the densities that ``estimateProbability`` returns for
    the individual attributes are multiplied together.

    Args:
        summaries: Dictionary mapping each class value to its list of
            ``(mean, stdev)`` tuples, one per attribute.
        testVector: The test instance. Element ``i`` is scored against the
            ``i``-th ``(mean, stdev)`` tuple, so a trailing label element is
            never read.

    Returns:
        A dictionary mapping each class value to the product of its
        per-attribute densities.
    """
    p = {}
    for classValue, classSummaries in summaries.items():
        # Every class starts at 1, the multiplicative identity.
        p[classValue] = 1
        for i, (mean, stdev) in enumerate(classSummaries):
            x = testVector[i]  # i-th attribute of the test instance
            p[classValue] *= estimateProbability(x, mean, stdev)
    return p
#classValue will be 1 and 0
#classSummaries contains mean and standard deviation of each
class
#summaries contains mean and standard deviation of 8 attributes of
2 classes. 17
for i in range(len(classSummaries)):
mean, stdev = classSummaries[i]
18
x = testVector[i] #testSet’s first attribute
p[classValue] *= estimateProbability(x, mean, stdev);
19
# The function looks for the largest probability and returns the associated class.
def predict(summaries, testVector):
    """Return the class label with the highest score for one test row.

    Args:
        summaries: Per-class ``(mean, stdev)`` summaries (the model).
        testVector: One test-set instance, i.e. one row of the test set.

    Returns:
        The label whose value in the dictionary returned by
        ``calculateClassProbabilities`` is largest. On a tie the label seen
        first wins. Returns ``None`` if there are no classes.
    """
    # Scores for each class (1 and 0).
    all_p = calculateClassProbabilities(summaries, testVector)
    # max() keeps the first of several equal maxima, the same as a loop that
    # only replaces the best label on a strictly greater score.
    return max(all_p, key=all_p.get, default=None)
# compare probability values and return class label
with highest probability
21
if bestLabel is None or p > bestProb:
bestProb = p
bestLabel = lbl
return bestLabel
# Compare 2 probability values and return label
associated with highest probability
22
def perform_classification(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: Per-class ``(mean, stdev)`` summaries (the model).
        testSet: List of test rows.

    Returns:
        A list with the result of ``predict`` for each row, in row order.
    """
    return [predict(summaries, row) for row in testSet]
# Function perform_classification returns the predictions
# for all test-set rows (153).
23
def perform_classification(summaries, testSet):
    """Predict a class label for every row of ``testSet`` (annotated copy).

    Args:
        summaries: Mean and standard deviation of the 8 attributes for each
            of the 2 classes (class 1 and class 0).
        testSet: List of test rows (153 rows in the original notes).

    Returns:
        The list of predicted values, one per test row.
    """
    # The predictions list stores the predicted value of every test row.
    # Initially it is empty.
    predictions = []
    for row in testSet:  # for each test-set instance
        # Calculate the prediction (1 or 0) and store it in result.
        result = predict(summaries, row)
        # Append each prediction result to the predictions list.
        predictions.append(result)
    return predictions
25
# Function which returns the list of predictions,
# one for each instance.
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows that were predicted correctly.

    A prediction is correct when it equals the last element (the class
    value) of the corresponding test row.

    Args:
        testSet: List of test rows whose last element is the true label.
        predictions: Predicted labels, indexed like ``testSet``.

    Returns:
        ``(correct predictions / len(testSet)) * 100`` as a float.

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    # Indexing predictions (rather than zipping) keeps a too-short
    # predictions list an error instead of silently truncating.
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0
# getAccuracy takes 2 arguments: testSet and predictions.
# • If the class value of a test-set row matches the prediction
#   for that row, the prediction is correct and
#   correct is incremented by 1.
# • Accuracy = how many test-set rows are correctly predicted by
#   the model.
• Accuracy=(no of correct predictions/153)*100 26
# loadCsv is called and the entire dataset is stored in dataset.
dataset = loadCsv('pima-indians-diabetes.csv')
print('Pima Indian Diabetes Dataset loaded...')
# Length of the dataset = number of rows (768 per the original notes).
print('Total instances available :', len(dataset))
# The last column contains the target value (diabetes yes/no), so it is
# not counted as an attribute (8 attributes per the original notes).
print('Total attributes present :', len(dataset[0]) - 1)

# Print the first 5 rows of the dataset. Slicing avoids an IndexError when
# fewer than 5 rows were loaded.
for i, row in enumerate(dataset[:5], start=1):
    print(i, ':', row)
28
29
Pregnanci
es Glucose
Blood
pressure
Skin
thickne
ss
Insuli
n BMI
Diabetic
pedigre
e
function Age
Outcome(diabe
tes)
6 148 72 35 0 33.6 0.627 50 1
1 85 66 29 0 26.6 0.351 31 0
8 183 64 0 0 23.3 0.672 32 1
1 89 66 23 94 28.1 0.167 21 0
0 137 40 35 168 43.1 2.288 33 1
First 5 rows of dataset
# Fraction of the dataset that goes into the test set.
splitRatio = 0.2
# splitDataset splits the dataset into trainingSet and testSet. Per the
# original notes, 768 * 0.2 = 153 rows go to testSet and 615 stay in
# trainingSet.
trainingSet, testSet = splitDataset(dataset, splitRatio)
# The "\n" escapes had degraded to a literal "n" and the second string was
# broken across two lines; both are restored here.
print('\nDataset is split into training and testing set.')
# Display the number of trainingSet examples and testSet examples.
print('Training examples = {0} \nTesting examples = {1}'.format(len(trainingSet), len(testSet)))
30
# Prepare the model: summaries contains the mean and standard deviation of
# the 8 attributes for each of the 2 classes (class 1 and class 0).
summaries = summarizeByClass(trainingSet)
31
# Predict the outcome of every testSet instance (153 instances per the
# original notes).
predictions = perform_classification(summaries, testSet)
32
Using summaries (i.e model) testSet is predicted. i.e 153
examples are predicted using 615 trainingSet examples
• predictions = perform_classification(summaries,
testSet)
Outcome of each testSet(153) is predicted
33
# getAccuracy calculates the accuracy of the testSet predictions.
accuracy = getAccuracy(testSet, predictions)
# The leading "\n" had degraded to a literal "n" and is restored here.
print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)
34
Training Set
35
8 examples
3 attributes
36
Standard Deviation
37
Standard Deviation
38
Standard Deviation
Std Std Std
39
40
Std Std Std
41

lab program 6.pdf

  • 1.
    import csv, random,math # import csv,random,math module import statistics as st # import statistics module from statistics import stdev # from statistics library import stdev method 1
  • 2.
    # Loading theData from csv file of pima-indians-diabetes dataset. def loadCsv(filename): lines = csv.reader(open(filename, "r")); #open csv file in read mode and read file dataset = list(lines) for i in range(len(dataset)): # converting the attributes from string to floating point numbers dataset[i] = [float(x) for x in dataset[i]] return dataset 2
  • 3.
    Note: • dataset contains768 observations of medical details for Pima Indian patients. • The attributes are : 1. Pregnancies 2. Glucose 3. Blood pressure 4. Skin thickness 5. Insulin 6. BMI 7. Diabetic pedigree function 8. Age Outcome(diabetes) 1 or 0 3
  • 4.
    def splitDataset(dataset, splitRatio): testSize= int(len(dataset) * splitRatio); (768*0.2)=153 trainSet = list(dataset); // All 768 examples testSet = [] // testSet is initially empty while len(testSet) < testSize: #untill len(testSet) becomes 153 #randomly pick an instance from training data index = random.randrange(len(trainSet)); testSet.append(trainSet.pop(index)) return [trainSet, testSet] #Function will split dataset into trainingSet and testSet #trainSet contains initially 768 examples #randomly pick 153 instances from trainSet, remove it and place it in testSet i.e trainSet+testSet=768 4
  • 5.
    def separateByClass(dataset): separated ={} for i in range(len(dataset)): x = dataset[i] # x contains row of dataset if (x[-1] not in separated): separated[x[-1]] = [] separated[x[-1]].append(x) return separated #Function separateByClass groups training set into 2 classes. 1 group contains all positive instances. 1 group contains all negative instances. #separated= dictionary, key 1 value= all positive classification instances, key=0, value= all negative classification instances • #Function to categorize the dataset in terms of classes (1 and 0) • #The function assumes that the last attribute (-1) is the class value. • x[-1]=1 or 0 5
  • 6.
    6 separated is adictionary with key = 1 and 0 When key=1, values are all positive instances When key=0, values are all negative instances
  • 7.
    7 separated is adictionary with key values 1 and 0 When key=1, values are all positive instances When key=0, values are all negative instances
  • 8.
  • 9.
    def stdev(numbers): avg=mean(numbers) variance=sum([pow(x-avg,2) forx in numbers])/float(len(numbers)-1) return math.sqrt(variance) # Function to calculate standard deviation 9 # square root of variance is standard deviation
  • 10.
    def compute_mean_std(dataset): mean_std =[(st.mean(attribute), st.stdev(attribute))for attribute in zip(*dataset)]; del mean_std[-1] # Exclude mean and standard deviation of last column of dataset (i.e label 1 and 0) return mean_std # All class 1 data are zipped. For all 8 attributes calculate mean and standard deviation. # All class 0 data are zipped. For all 8 attributes calculate mean and standard deviation. # the function compute_mean computes the mean and standard deviation of 8 attributes for both positive and negative instances 10
  • 11.
    Mean, standard deviationof 8 attributes for class 1 and 0 class 1 data mean and standard deviation class 0 data mean and standard deviation 11 1. Pregnancies 2. Glucose 3. Blood pressure 4. Skin thickness 5. Insulin 6. BMI 7. Diabetic pedigree function 8. Age
  • 12.
    def summarizeByClass(dataset): dataset=TrainingSet separated= separateByClass(dataset) #The function separateByClass(dataset) will divide training set into 2 classes 1).All instances with class 1 (Having diabetes) 2) All instances with class 0 (Not having diabetes) 12
  • 13.
    #separated= dictionary, Whenkey=0 value= all positive classification instances, When key=1, value= all negative classification instances summary = {} # to store mean and standard deviation of +ve and -ve instances for classValue, instances in separated.items(): summary[classValue] = compute_mean_std(instances) return summary #summary is a dictionary of tuples(mean,std) for each class value 13
  • 14.
    for classValue, instancesin separated.items(): #summary is a dictionary of tuples(mean,std) for each class value summary[classValue] = compute_mean_std(instances) return summary classValue =1, instances= all positive class instances classValue=0, instances=all negative class instances 14 separated is a dictionary with key values 1 and 0 When key=1, values are all positive instances When key=0, values are all negative instances
  • 15.
    summary[classValue] = compute_mean_std(instances) summary[1]=meanand standard deviation of all positive instances summary[0]=mean and standard deviation of all negative instances 15
  • 16.
    def estimateProbability(x, mean,stdev): exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))) return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent # Calculate probability density function 16
  • 17.
    def calculateClassProbabilities(summaries, testVector): p= {} testinstance #class and attribute information as mean and standard deviation for classValue, classSummaries in summaries.items(): p[classValue] = 1 #Initially p[1]=1 and p[0]=0 for i in range(len(classSummaries)): mean, stdev = classSummaries[i] x = testVector[i] #testvector's first attribute p[classValue] *= estimateProbability(x, mean, stdev); return p #classValue will be 1 and 0 #classSummaries contains mean and standard deviation of each class #summaries contains mean and standard deviation of 8 attributes of 2 classes. 17
  • 18.
    for i inrange(len(classSummaries)): mean, stdev = classSummaries[i] 18
  • 19.
    x = testVector[i]#testSet’s first attribute p[classValue] *= estimateProbability(x, mean, stdev); 19
  • 20.
    Function look forthe largest probability and return the associated class def predict(summaries, testVector): testVector= testSet instance i.e each row of testSet all_p = calculateClassProbabilities(summaries, testVector) #return probabilities of each class (1 and 0) 20
  • 21.
    bestLabel, bestProb =None, -1 #initially bestLabel=None, bestProb=-1 for lbl, p in all_p.items(): # lbl will contain 1 and 0 #p contains probability value if bestLabel is None or p > bestProb: bestProb = p bestLabel = lbl return bestLabel # compare probability values and return class label with highest probability 21
  • 22.
    if bestLabel isNone or p > bestProb: bestProb = p bestLabel = lbl return bestLabel # Compare 2 probability values and return label associated with highest probability 22
  • 23.
    def perform_classification(summaries, testSet): predictions= [] for i in range(len(testSet)): result = predict(summaries, testSet[i]) predictions.append(result) return predictions // Function perform_classification return predictions of all testSets (153) 23
  • 24.
    def perform_classification(summaries, testSet): predictions= [] # predictions list will store predicted values of 153 testsets # initially predictions list is empty summaries contains mean and standard deviation of 8 attributes of 2 classes. Class 1 Class 0 24
  • 25.
    for i inrange(len(testSet)): # for each testSet instance result = predict(summaries, testSet[i]) #calculate prediction (1 or 0) and store in result predictions.append(result) #append each prediction result to list predictions return predictions 25
  • 26.
    # Function whichreturn predictions for list of predictions # For each instance def getAccuracy(testSet, predictions): correct = 0 for i in range(len(testSet)): if testSet[i][-1] == predictions[i]: correct += 1 return(correct/float(len(testSet))) * 100.0 Function takes 2 arguments testSet and predictions • If testSet classification value matches with predictions value of that testSet, then prediction is correct. Then increment correct by 1. • Accuracy=how many testSets are correctly predicted by model. • Accuracy=(no of correct predictions/153)*100 26
  • 27.
    dataset = loadCsv('pima-indians-diabetes.csv'); #loadCsvfunction is called and entire dataset is stored in dataset print('Pima Indian Diabetes Dataset loaded...') print('Total instances available :',len(dataset)) #length of dataset is 768 (number of rows) print('Total attributes present :',len(dataset[0])-1) #Attributes of dataset is 8 . #Last column contains target value (diabetes-Yes/No) 27
  • 28.
    for i inrange(5): print(i+1 , ':' , dataset[i]) # Print 1st 5 rows of dataset 28
  • 29.
    29 Pregnanci es Glucose Blood pressure Skin thickne ss Insuli n BMI Diabetic pedigre e functionAge Outcome(diabe tes) 6 148 72 35 0 33.6 0.627 50 1 1 85 66 29 0 26.6 0.351 31 0 8 183 64 0 0 23.3 0.672 32 1 1 89 66 23 94 28.1 0.167 21 0 0 137 40 35 168 43.1 2.288 33 1 First 5 rows of dataset
  • 30.
    splitRatio = 0.2 trainingSet,testSet = splitDataset(dataset, splitRatio) #Function splitDataset split dataset into trainingSet and testSet # trainingSet contains 615 examples and testSet contains 153 examples print('nDataset is split into training and testing set.') print('Training examples = {0} nTesting examples = {1}'.format(len(trainingSet), len(testSet))) # Display number of trainingSet examples and testSet examples. 768 *0.2=153 testSet examples 615 trainingSet examples 30
  • 31.
    • summaries =summarizeByClass(trainingSet) summaries contains mean and standard deviation of 8 attributes of 2 classes. Class 1 Class 0 mean.standard deviation #prepare model 31
  • 32.
    predictions = perform_classification(summaries,testSet) 153 instances 32 Using summaries (i.e model) testSet is predicted. i.e 153 examples are predicted using 615 trainingSet examples
  • 33.
    • predictions =perform_classification(summaries, testSet) Outcome of each testSet(153) is predicted 33
  • 34.
    • accuracy =getAccuracy(testSet, predictions) • print('nAccuracy of the Naive Baysian Classifier is :', accuracy) # getAccuracy function will calculate the accuracy of predictions of testSet 34
  • 35.
  • 36.
  • 37.
  • 38.
  • 39.
  • 40.
  • 41.