-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_impute.py
More file actions
executable file
·152 lines (110 loc) · 4.5 KB
/
data_impute.py
File metadata and controls
executable file
·152 lines (110 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
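'''
Imputation:
    Interactively handles the null values of a DataFrame, column by column.
    Method 1:
        Remove the rows in which the column is null
    Method 2:
        Fill null values with a user-specified value
    Method 3:
        Fill null values with the column average
        (median and mode fills are offered as well)
'''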
import pandas as pd
import numpy as np
class Imputer:
    """Interactive, console-driven imputation of null values in a DataFrame.

    The wrapped frame is kept in ``self.df``. Fill operations assign the
    filled column back into that frame; row removal rebinds ``self.df`` to a
    filtered copy. All user interaction goes through ``print`` / ``input``.
    """

    # Menu shown for columns whose dtype is ``object``.
    _OBJECT_MENU = (
        "1. Column Removal\n"
        "\n2. Fill with some value\n"
        "\n3. Fill Mode value\n"
        "\n4.Ignore\n"
    )

    # Menu shown for every other (treated as numeric) column.
    _NUMERIC_MENU = (
        "1. Column Removal\n"
        "\n2. Fill with some value\n"
        "\n3. Fill Average value\n"
        "\n4. Fill Median value\n"
        "\n5. Fill Mode value\n"
        "\n6.Ignore\n"
    )

    def __init__(self, df):
        """Store the DataFrame to be imputed (no copy is made)."""
        self.df = df

    @staticmethod
    def _parse_choice(text):
        """Return ``text`` as an int, or ``None`` when it is not an integer.

        A ``None`` result never matches a menu number, so a typo falls
        through to the "ignore" / "do not keep" branch instead of raising.
        """
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _parse_number(text):
        """Parse a numeric fill value, keeping integers as ``int``.

        Raises:
            ValueError: if ``text`` is neither an integer nor a float.
        """
        try:
            return int(text)
        except ValueError:
            # Fractional values such as "3.5" are valid fill values too.
            return float(text)

    def _null_count(self, colm):
        """Return the number of null entries in column ``colm``."""
        return int(self.df[colm].isnull().sum())

    def colm_rem(self, colm):
        """Drop the rows where ``colm`` is null, after user confirmation.

        Prints a ``describe()`` preview of the filtered frame, then asks the
        user to confirm; ``self.df`` is replaced only when the answer is 1.
        """
        print(f"Removing the null value rows of {colm}")
        # Copy so that later column assignments act on an independent frame
        # rather than on a slice of the original.
        temp_df = self.df[pd.notnull(self.df[colm])].copy()
        print(temp_df.describe())
        print("\n Do you want to keep the changes[0/1]\n")
        if self._parse_choice(input()) == 1:
            print("updating column")
            self.df = temp_df

    def colm_fill(self, colm, colm_type):
        """Fill the nulls of ``colm`` with a value typed by the user.

        Args:
            colm: column label.
            colm_type: ``"obj"`` to use the typed text as-is; any other value
                makes the text be parsed as a number (int or float).

        If a numeric value is required and the text is not a number, a
        message is printed and the column is left unchanged.
        """
        print("You can fill the column with element of your choice")
        raw = input("Enter the value to fill with")
        if colm_type == "obj":
            fill_with = raw
        else:
            try:
                fill_with = self._parse_number(raw)
            except ValueError:
                print(f"'{raw}' is not a number; leaving {colm} unchanged")
                return
        self.df[colm] = self.df[colm].fillna(fill_with)

    def colm_avg(self, colm):
        """Fill the nulls of ``colm`` with the column mean."""
        print("Filling the nan values with the average of the column\n")
        self.df[colm] = self.df[colm].fillna(self.df[colm].mean())

    def colm_median(self, colm):
        """Fill the nulls of ``colm`` with the column median."""
        print("Filling the nan values with the median of the column\n")
        self.df[colm] = self.df[colm].fillna(self.df[colm].median())

    def colm_mode(self, colm):
        """Fill the nulls of ``colm`` with the first mode of the column.

        When the column has no non-null values ``mode()`` is empty; in that
        case a message is printed and the column is left unchanged.
        """
        print("Filling the nan values with the mode of the column\n")
        modes = self.df[colm].mode()
        if modes.empty:
            print(f"{colm} has no non-null values; nothing to fill with")
            return
        self.df[colm] = self.df[colm].fillna(modes.iloc[0])

    def suggest_imp(self, colm_names):
        """Walk ``colm_names`` and ask how to impute each column with nulls.

        Columns without nulls are only reported. For the others the null
        count and a menu are printed; the menu depends on whether the column
        dtype is ``object``. Any choice that is not a listed action (including
        non-integer input) ignores the column.

        Returns:
            ``self.df`` after all chosen actions have been applied.
        """
        for colm in colm_names:
            colm_null = self._null_count(colm)
            if colm_null == 0:
                print(f"Column Name - {colm} has no null values")
                continue

            if self.df[colm].dtype == "object":
                print(f"{colm} is of object type\n")
                print(colm_null)
                print(self._OBJECT_MENU)
                removal_inp = self._parse_choice(input())
                if removal_inp == 1:
                    self.colm_rem(colm)
                elif removal_inp == 2:
                    self.colm_fill(colm, "obj")
                elif removal_inp == 3:
                    self.colm_mode(colm)
                else:
                    print(f"Ignoring {colm} imputation")
            else:
                print(f"{colm} is of numeric type\n")
                print(colm_null)
                print(self._NUMERIC_MENU)
                removal_inp = self._parse_choice(input())
                if removal_inp == 1:
                    self.colm_rem(colm)
                elif removal_inp == 2:
                    self.colm_fill(colm, "num")
                elif removal_inp == 3:
                    self.colm_avg(colm)
                elif removal_inp == 4:
                    self.colm_median(colm)
                elif removal_inp == 5:
                    self.colm_mode(colm)
                else:
                    print(f"Ignoring {colm} imputation")
                    # An ignored numeric column skips the pause below.
                    continue

            input("Press enter to move to next column")
        return self.df

    def impute(self):
        """Entry point: print per-column null counts, then run the prompts.

        Returns:
            ``self.df`` after imputation, or ``None`` when the user enters
            ``-1`` at the first prompt (``self.df`` is left untouched).
        """
        colm_names = list(self.df.columns.values)
        # Print the null count of every column before asking anything.
        print("column - " + "Null Count")
        for colm in colm_names:
            print(f"{colm} - {self._null_count(colm)}")
        procede_inp = input("Enter -1 to return , press any key to impute")
        if procede_inp == "-1":
            return None
        self.df = self.suggest_imp(colm_names)
        return self.df