import pandas as pd
data.to_csv("路径",encoding=utf-8) 保存文件
data.head() 查看前几行 data.tail() 查看后几行 data.shape 几行几列 data.index 查看索引 data.columns 查看标题 data.values 查看值 data.info 查看整体结构 data.describe() 对数值型数据进行描述统计 data.value_counts()对值计数 data.sort_index(axis=1/0) 对索引进行排序 参数ascending=False 降序排序 data.sort_calues(by="columens") 按照某一行的值进行排序 inplace=True 修改原始数据
选取数据 data.columns data["cloumns"] data.loc[] 显示索引 data.iloc[] 隐示索引
## 映射函数 data.apply()
#data.apply(abs) 将abs作用于data的每一个数据 data=pd.Series([1,-2,3,-3],index=["a","b","c","d"]) data.apply(abs)
data.iloc[[0,2],data.columns.get_loc("one")] 混合索引 0-2 行 one列 data.iloc[[0,2],data.columns.get_indexer(["one","tow"])] 同时得到2列
布儿型索引
| 代表 or & 代表 and -代表not
data[(df.one>0)&(df.two>0)] data[(df.one>0)|(df.two>0)] data[(df.one>0)-(df.two>0)]
用与筛选需要的子集
import numpy as np ser=pd.Series(np.arange(5),index=np.arange(5)[::-1],dtype="int32") ser
ser.isin([2,3,4])#查看是否存在 2,3,4
###随机抽样 sample()方法从Series或者DataFriame中随机选择行或列 ser.sample()
参数 n=None, #抽取多少个 frac=None, #抽取多少比列 replace=False, #是否为有放回抽样 weights=None, #设定每一行的权重 random_state=None, #是否需要重现随机的结果,设置随机数种子 axis=None # 设定是对行采样还是对列采样
ser=pd.Series([1,2,3,4,5])
ser
ser.sample()#默认抽取一个
ser.sample(4)#默认抽取4个
ser.sample(frac=0.8)#抽取80%
#不加参数默认进行不放回抽样,使用replace 替换抽样方式 ser.sample(n=5,replace=False)# 不放回
ser.sample(n=5,replace=True)#有放回
ser_weight=[0.1,0.2,0.2,0.3,0.4] ser.sample(n=4,weights=ser_weight ) #总体权重和为1 如果输入的值不为一,会从新归一化
#在采样中,会用DataFrame的某一列作为权重 df=pd.DataFrame({"first":[4,5,6,7],"weight_column":[0.3,0.4,0.2,0.1]}) df
df.sample(n=2,weights="weight_column")
df.sample(n=2,axis=1)
df.sample(n=2,random_state=2)
?df.sample
df1=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[0,1,2,3]) df2=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[4,5,6,7]) df3=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[8,9,10,11])
In [41]:
print(df1);print(df2);print(df3)
###用pd.concat()合并对象 参数 pd.concat() objs, 数据集 axis=0, 轴线 默认0 join=‘outer‘, 连接方式 inner outer join_axes=None, 用指定的轴进行合并 ignore_index=False,都合并没有就不合并 /True 根据列字段对齐合并,生成新的索引 keys=None, 指定不同数据源 levels=None, names=None, verify_integrity=False, copy=True)
pd.concat([df1,df2,df3])#列合并
df4=pd.DataFrame({"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "E":["E0","E1","E4","E5"]},index=[0,1,4,5]) df4
pd.concat([df1,df4],axis=1)#横向合并
pd.concat([df1,df4],axis=1,join="inner")#取交集
pd.concat([df1,df4],axis=1,join_axes=[df1.index])#指定合并的轴
pd.concat([df1,df4],ignore_index=False)
pd.concat([df1,df4],ignore_index=True)#生成新的index
Out[53]:
ser=pd.Series(["s0","s1","s2","s3"],name="s") ser
pd.concat([df1,ser],axis=1)#合并之后Series的名称自动成为列名称,不指定name自动生成
pd.concat([df1,df2,df3],keys=["one","two","three"])#区分不同的数据来源
data=pd.concat([df1,df2,df3]) dic={"one":df1,"two":df2,"three":df3} pd.concat(dic) #也可以区分不同的数据集
df.append()
df1.append(df4)
df1.append([df2,df3])
#用append方法添加新行
ser3=pd.Series(["q1","q2","q3","q4"],index=["A","B","C","D"]) ser3
df1.append(ser3,ignore_index=True)
import pandas as pd
df=pd.read_csv("taobao.csv",encoding="gbk") df.head()
df.tail(10)
df.info()
Out[88]:
#查看描述统计信息
df.describe()
#行的选择 df[0:5] df.iloc[0:5]
#列的选择
cols=df[["宝贝","价格"]] type(cols) cols.head() cols=df[["宝贝","价格"]].head()#数据太多读取太慢,可选择只查看多少行 cols
#区域的选择
df.loc[0:3,["宝贝","价格"]] df.loc[df.index[0:3],["宝贝","价格"]]
df["销售额"]=df["价格"]*df["成交量"] df.head()
#过滤掉价格>=100,成交量<8000的商品信息 df[(df["价格"]<100)&(df["成交量"]>=8000)]
#将位置设置为索引 #df.index=df["位置"] df1=df.set_index("位置") df1.head()
#排序
df2=df1.sort_index()
df2.head()
#两个索引
df3=df.set_index(["位置","品牌"]) df3.head() #并根据位置进行排序 #df4=df3.sort_index(level=0) df4=df3.sort_index(level="位置") df4.head()
#删除不需要的数据 deal=df.drop(["宝贝","品牌","位置"],axis=1) deal.head() #inplace=Fals 不修改原始数据 True 修改原始数据
deal.groupby("位置").mean()#均值
df["成交量"].groupby(df["位置"]).mean()
df["成交量"].groupby([df["位置"],df["品牌"]]).mean() #按多组列进行分组
#创建数据 df1=df[20:30][["位置","品牌"]] df1.head()
df2=df[25:35][["品牌","价格","成交量"]] df2.head()
df2.info()
#pd.merge 根据一个或多个KEY值,将DataFrame连接(join) #pd.concat 沿着一个轴拼接 #combine_first 如果有缺失值,另外要给数据集对其进行填充
pd.merge(df1,df2).head()
pd.merge(df1,df2,how="outer").head()#how默认为 inner 可修改为 outer left right
#索引合并 pd.merge(df2,df1,left_index=True,right_index=True).head()
import pandas as pd df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,16,17,15],"最低气温":[7,8,8,9,12,3,5],"天气":["晴","多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风","东北风","东北风","西北风","西北风","北风","南风"],"风力":[2,2,2,1,2,3,2]})
df=df.reindex(["日期"]+["最高气温"]+["最低气温"]+["天气"]+["风向"]+["风力"],axis=1) df.head()
df.stack()#列转化为层级的Series
df.stack().unstack()#还原
pd.pivot_table() data, 数据集 values=None, 值是谁 index=None, 索引是谁 columns=None, 标题是谁 aggfunc=‘mean‘, 聚合的函数是谁 fill_value=None, margins=False, dropna=True, 是否召回 margins_name=‘All‘
df_table=pd.pivot_table(df,index=["天气"],columns=["风向"],values=["最高气温"]) df_table
df_table.info()
import numpy as np
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,np.nan,17,15],"最低气温":[7,8,8,np.nan,12,3,5],"天气":[np.nan,"多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风",np.nan,"东北风","西北风",np.nan,"北风","南风"],"风力":[2,2,np.nan,1,2,3,2]})
df
df.isnull()#发现缺失值 True 为有缺失
df.notnull()#发现缺失值 False 为有缺失
df.dropna(axis=0) 删除有缺失值的行
df.dropna(axis=1)#删除有缺失值的列
#用字符串填充 df.fillna("missing")
#使用前一个数值代替 df.fillna(method="pad")
df.fillna(method="pad",limit=1)#只向下或向上填充一个,填充过多数据不准
#向后填充 df.fillna(method="bfill")
#用均值填充 df.fillna(df.mean())
df.fillna(df.mean()["最低气温":"最高气温"])#只填充需要填充的行数
df.loc[:,"最低气温":"最高气温"].fillna(df.mean())
#参照正太分布 定义》3标准差或小于-3标准差的值为异常值 sta=(df["最高气温"]-df["最高气温"].mean())/df["最高气温"].std() sta.abs()>1
df["最高温度是否异常"]=sta.abs()>1 df
df["最高温度是否异常"].value_counts()
#用箱线图定义异常值
h=df["最高气温"] iqr=h.quantile(0.75)-h.quantile(0.25) df_max=h.quantile(0.75)+1.5*iqr df_min=h.quantile(0.25)-1.5*iqr
df_max
df_min
df["isouter"]=(h>df_max)|(h<df_min)
df
df.duplicated()
df.duplicated("风力")
d2=df.drop_duplicates("风力")#删除有重复项的行
d2
import time
time.time()#时间戳是指格林威治时间自1970年1月1日(00:00:00 gmy)至当前时间的总秒数 #北京时间1970年1月1日(08:00:00)
time.localtime()
time.strftime()#format 时间格式 %Y Year with century as a decimal number. %m Month as a decimal number [01,12]. %d Day of the month as a decimal number [01,31]. %H Hour (24-hour clock) as a decimal number [00,23]. %M Minute as a decimal number [00,59]. %S Second as a decimal number [00,61]. %z Time zone offset from UTC. %a Locale‘s abbreviated weekday name. %A Locale‘s full weekday name. %b Locale‘s abbreviated month name. %B Locale‘s full month name. %c Locale‘s appropriate date and time representation. %I Hour (12-hour clock) as a decimal number [01,12]. %p Locale‘s equivalent of either AM or PM.
time.strftime("%Y-%m-%d",time.localtime())#把当前时间转换成可读形式,注意转换之后为str格式
s=time.strftime("%Y-%m-%d",time.localtime())
type(s)
d=time.strptime(s,"%Y-%m-%d")#返回datetime格式的时间
type(d)
time.localtime(1533785557.0)
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(1533785557.0))
import datetime import pandas as pd import numpy as np
#取当前datetime格式的时间 datetime.datetime(2018,8,8)
pd.date_range() start=None, 开始 end=None, 结束 periods=None, 生成多少个 freq=‘D‘, 默认按天计算 tz=None, normalize=False, name=None, closed=None, **kwargs)
#生成时间序列数据 pd.date_range(datetime.datetime(2018,8,8),periods=4)
pd.date_range("2018-8,-8",periods=4)#指定生成个数
pd.date_range("2018-8-8","2018-9-9")#指定起始日期
pd.date_range("2018-8-8 11:00","2018-8-9 00:00",freq="H")#按小时生成序列
ser=pd.Series(np.arange(10),index=pd.date_range("2018-8-9",periods=10)) ser
ser["2018-8-9"]
ser.index[2].year
ser.index[2].month
ser.index[2].day
pd.to_datetime()
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,np.nan,17,15],"最低气温":[7,8,8,np.nan,12,3,5],"天气":[np.nan,"多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风",np.nan,"东北风","西北风",np.nan,"北风","南风"],"风力":[2,2,np.nan,1,2,3,2]})
df.info()
df["日期"]=pd.to_datetime(df["日期"].values,format="%Y-%m-%d")
df.info()
df
#将日期设置为索引 df=df.set_index("日期")
df
df_join=df[(df.index>="2017-01-01")&(df.index<="2017-02-01")]#注意时间输入需与索引格式一致 df_join
df["2017-01-01":"2017-01-31"].info()
#转换成月份 df.to_period("M")
data=pd.DataFrame({"Rank":[1,2,3,4,5],"city":["london","benrlin]","madind","rome","pans"],"state":[" kingdom"," gemany","spain ","ltaly","frnce"], "popuiation":["8,615,246","3,437,916","3,165,235","2,872,086","2,273,305"],"dateofcensusestumate":["1 june 2014","31 may 2014", "1 january 2014","30 september 2014","1 jannany 2013"]})
data
date=data.reindex(["Rank"]+["city"]+["state"]+["popuiation"]+["dateofcensusestumate"],axis=1)#排序
date
date.info()
date["popuiation"].apply(lambda x :x.split(","))#按照逗号分隔
date["popuiation"].apply(lambda x :x.replace(",",""))#把逗号替代为空 #lambda 匿名函数 #apply 循环
subtr=date["popuiation"].apply(lambda x : int(x.replace(",","")))
date["numericpopuiation"]=subtr date
date["state"].values# 发现数据有空格
date["state"].apply(lambda x :x.strip())#剔除前后空格
stri=date["state"].apply(lambda x :x.strip())#空格没有了 date["stace"]=stri date["stace"].values
str_1=pd.DataFrame({"title":["网名最喜欢的旅游目的地榜单出炉","让生活更幸福是旅游业的使命","一带一路国家中东欧游客增两倍","旅游业改革开启旅游强国新篇章"], "link":["http//cntour.cninews/4221/","http//cntour.cninews/4212/","http//cntour.cninews/4202/","http//cntour.cninews/4191/"]})
str_1
注:括号里面是需要的内容
str_1["link"]
str_1["link"].str.extract("ews/(.+)/",expand=False)
str_2=str_1["link"].str.extract("ews/(.+)/",expand=False) str_1["links"]=str_2 str_1
dic={"4221":"过","4212":"来","4202":"玩","4191":"啊"}
%%time str_1["linkss"]=str_1["links"].map(dic)#map 映射函数,可将dic的值 根据键一一对应,映射到str——1 str_1 Wall time: 3 ms
原文:https://www.cnblogs.com/Koi504330/p/11902191.html