"""Summarise time zones and Windows usage in a line-delimited JSON click log.

Run as a script. It reads ``usagov_bitly_data2012-03-16-1331923249.txt`` from
the current working directory, prints several intermediate tables, and shows
a stacked horizontal bar chart with the Windows / not-Windows share for the
ten time zones that have the most records.
"""

import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

if __name__ == "__main__":
    path = "usagov_bitly_data2012-03-16-1331923249.txt"

    # One JSON document per line. The context manager closes the file even if
    # a line fails to parse; blank lines are skipped because json.loads("")
    # raises.
    with open(path) as fp:
        records = [json.loads(line) for line in fp if line.strip()]
    print(len(records))

    frame = pd.DataFrame(records)
    print(frame['tz'])

    # Keep absent tz values ('Missing') distinguishable from values that are
    # present but empty ('Unknown').
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print(tz_counts[:20])
    # Optional: chart the 10 most frequent time zones.
    # tz_counts[:10].plot(kind='barh', rot=0)
    # plt.show()

    # First whitespace-delimited token of every non-null `a` value
    # (presumably the user-agent string -- confirm against the data set).
    results = pd.Series([x.split()[0] for x in frame.a.dropna()])
    print(results[:5])

    # Only rows with an `a` value can be classified by substring match.
    cframe = frame[frame.a.notnull()]
    operating_system = np.where(
        cframe['a'].str.contains('Windows'), 'Windows', 'not Windows'
    )
    print(operating_system[:10])

    # Count records per (tz, operating_system) pair; combinations that never
    # occur become 0 instead of NaN.
    by_tz_os = cframe.groupby(['tz', operating_system])
    agg_counts = by_tz_os.size().unstack().fillna(0)
    print(agg_counts[:10])

    # Positions that sort the time zones by total record count, ascending.
    indexer = agg_counts.sum(axis=1).argsort()
    print(indexer[:10])

    # The last ten rows in ascending order are the ten largest totals.
    count_subset = agg_counts.take(indexer)[-10:]
    print(count_subset)
    # Optional: chart the absolute counts instead of the normalised shares.
    # count_subset.plot(kind='barh', stacked=True)

    # Divide each row by its own total so every bar sums to 1.
    normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
    normed_subset.plot(kind='barh', stacked=True)
    plt.show()
# Original article: http://www.cnblogs.com/sklww/p/3655246.html