缺失值处理
"""Missing-value handling: treat zero prices as missing, then impute them."""
import pandas as pda
import numpy as npy

# The table was originally loaded with pda.read_excel("D:/taobao2.xls");
# rows are now passed to index() directly.


def index(data):
    """Impute missing cells with the mean of the "价格" column.

    Prints the raw frame, then the frame after zero prices have been turned
    into missing values, and finally the number of cells that were imputed.

    Args:
        data: Row-oriented table (sequence of rows). ``data[0]`` holds the
            column names and ``data[1:]`` the records. A "价格" column is
            required.

    Raises:
        KeyError: If the table has no "价格" column.
    """
    frame = pda.DataFrame(data[1:], columns=data[0])
    print(frame)

    # A price of 0 is treated as "missing". Series.mask builds a new column,
    # which avoids chained-indexing assignment (frame[col][mask] = None) that
    # may write to a temporary copy and leave the frame unchanged.
    frame["价格"] = frame["价格"].mask(frame["价格"] == 0)
    print(frame)

    # Every missing cell, in any column, is filled with the mean price.
    # The mean skips missing values, so it can be computed once up front.
    fill_value = frame["价格"].mean()
    filled = 0
    for column in frame.columns:
        n_missing = int(frame[column].isnull().sum())
        if n_missing:
            frame[column] = frame[column].fillna(fill_value)
            filled += n_missing
    print(filled)


if __name__ == "__main__":
    # NOTE(review): nosupervision_read_data is not defined in this snippet;
    # presumably it is supplied by the hosting environment - confirm.
    data = nosupervision_read_data()
    index(data)
数据离散化处理
"""Discretization of continuous data: equal-width and fixed-interval binning."""
import pandas as pda
import numpy as npy

# The table was originally loaded with pda.read_excel("D:/taobao2.xls");
# rows are now passed to index() directly.


def index(data):
    """Bin the third column of the table into labelled categories.

    Prints, in order: the sorted values, their equal-width binning into five
    labelled categories, the explicit bin edges, and the binning on those
    explicit edges.

    Args:
        data: Row-oriented table (sequence of rows). ``data[0]`` holds the
            column names and ``data[1:]`` the records. The third column must
            be numeric (or convertible to numbers).

    Raises:
        ValueError: If the third column cannot be converted to numbers or the
            table has no records.
    """
    frame = pda.DataFrame(data[1:], columns=data[0])

    # The column is taken by position, as in the original script.
    # NOTE(review): presumably the third column is the price - confirm.
    # It is read as a numeric column (not a slice of the mixed-dtype values
    # array) and sorted into a new array so the frame itself is not reordered.
    price = npy.sort(pda.to_numeric(frame.iloc[:, 2]).to_numpy())
    print(price)

    # Equal-width discretization into five bins.
    bin_count = 5
    c1 = pda.cut(price, bin_count, labels=["太便宜", "便宜", "适中", "贵", "太贵"])
    print(c1)

    # Discretization on explicitly specified intervals. Bin edges must be
    # strictly increasing, so when no value exceeds 100 the last edge is
    # left open-ended instead of repeating (or undercutting) 100.
    top = price.max().item()
    k = [0, 50, 100, top if top > 100 else float("inf")]
    print(k)
    c2 = pda.cut(price, k, labels=["非常便宜", "适中", "贵"])
    print(c2)


if __name__ == "__main__":
    # NOTE(review): nosupervision_read_data is not defined in this snippet;
    # presumably it is supplied by the hosting environment - confirm.
    data = nosupervision_read_data()
    index(data)
数据集成处理
# -*- coding:utf-8 -*-
"""Data integration: stack two row slices of a table into one result."""
import pandas as pda
import numpy as npy

# The table was originally loaded with pda.read_excel("D:/taobao2.xls");
# rows are now passed to index() directly.


def index(data):
    """Concatenate records 0-9 and 10-19 of the table and return them.

    The two slices and the merged result are printed, followed by the
    returned dictionary.

    Args:
        data: Row-oriented table (sequence of rows). ``data[0]`` holds the
            column names and ``data[1:]`` the records.

    Returns:
        dict: A single key, 'data_数据集成', mapped to the merged records as a
        list of row lists. Tables with fewer than 20 records yield however
        many records the two slices contain.
    """
    # The result has to be a dictionary named output.
    output = {}

    frame = pda.DataFrame(data[1:], columns=data[0])
    values = frame.values

    # Data integration: take two consecutive 10-row slices and stack them.
    first_part = values[0:10]
    second_part = values[10:20]
    merged = npy.concatenate((first_part, second_part))
    merged_frame = pda.DataFrame(merged)

    # Plain ASCII quotes: the typographic quotes previously around this key
    # are not valid Python string delimiters.
    output['data_数据集成'] = merged_frame.values.tolist()

    print(pda.DataFrame(first_part))
    print(pda.DataFrame(second_part))
    print(merged_frame)
    print(output)
    return output


if __name__ == "__main__":
    # NOTE(review): nosupervision_read_data is not defined in this snippet;
    # presumably it is supplied by the hosting environment - confirm.
    data = nosupervision_read_data()
    index(data)
原文:https://www.cnblogs.com/wei23/p/10890609.html