"""Word-frequency report for a text file.

Reads ``novel.txt``, normalises the text (lower-case, punctuation replaced by
spaces), counts every word that is not a stop word, and prints the
intermediate results followed by the 20 most frequent words.
"""

from collections import Counter

NOVEL_PATH = 'novel.txt'
# Punctuation characters that are replaced by a space before splitting.
SEPARATORS = '.,:;?!'
# Words excluded from the frequency ranking.
STOP_WORDS = {'a', 'the', 'and', 'we', 'you', 'of', 'si', 's', 'ter', 'to'}
TOP_N = 20


def normalize(text, separators=SEPARATORS):
    """Return *text* lower-cased with every char in *separators* turned into a space."""
    # One translate pass replaces the chain of per-character str.replace calls.
    table = str.maketrans(separators, ' ' * len(separators))
    return text.lower().translate(table)


def count_words(words, stop_words=STOP_WORDS):
    """Return a ``Counter`` of *words*, skipping every word in *stop_words*."""
    # Counter tallies in a single pass; calling list.count() once per unique
    # word rescans the whole list each time.
    return Counter(word for word in words if word not in stop_words)


def main(path=NOVEL_PATH):
    """Print the word statistics and the ``TOP_N`` most frequent words of *path*.

    Raises:
        OSError: if *path* cannot be opened for reading.
    """
    # Read the file; the context manager closes it even if read() fails.
    with open(path, 'r', encoding='utf-8') as novel:
        text = novel.read()
    print(text)

    # Lower-case everything and strip the punctuation.
    cleaned = normalize(text)
    print(cleaned)

    # Split into words on whitespace.
    words = cleaned.split()
    print(len(words), words)

    # Distinct words.
    unique_words = set(words)
    print(len(unique_words), unique_words)

    # Distinct words once the stop words are removed.
    meaningful_words = unique_words - STOP_WORDS
    print(meaningful_words)

    # Word -> occurrence count, restricted to the meaningful words so that
    # stop words cannot crowd out the ranking.
    counts = count_words(words)
    print(len(counts), dict(counts))

    # Highest count first; ties keep the order of first appearance in the text.
    ranked = counts.most_common()
    print(ranked)

    # Slicing (rather than indexing 0..TOP_N-1) tolerates texts that contain
    # fewer than TOP_N distinct words.
    for entry in ranked[:TOP_N]:
        print(entry)


if __name__ == '__main__':
    main()
# Original article: https://www.cnblogs.com/hodafu/p/9722203.html