"""Word-frequency statistics for the text stored in ``the.txt``.

Run as a script, this prints the raw text, the text with separators
blanked out, the word list, the set of distinct words, the per-word
counts, the (word, count) pairs before and after sorting, and finally the
most frequent words.

Original article: https://www.cnblogs.com/biubiuojbk/p/9206234.html
"""
from collections import Counter

SOURCE_PATH = 'the.txt'

# Characters that are replaced by a space before the text is split.
# Note that the apostrophe is included, so "it's" becomes "it" and "s".
SEPARATORS = (',', '.', "'", '\n')

# How many of the most frequent words step 6 prints.
TOP_N = 20

_SEPARATOR_TABLE = str.maketrans({ch: ' ' for ch in SEPARATORS})


def clean_text(text):
    """Return *text* with every character in SEPARATORS replaced by a space."""
    return text.translate(_SEPARATOR_TABLE)


def split_words(text):
    """Return the words of *text* in order of appearance.

    Splitting on arbitrary whitespace runs (``str.split()`` with no
    argument) means consecutive spaces never produce empty-string "words",
    so no separate double-space collapsing pass is needed.
    """
    return clean_text(text).split()


def count_words(words):
    """Return a dict mapping each distinct word to its number of occurrences.

    Counting is case-sensitive. A single pass with ``Counter`` replaces
    calling ``list.count`` once per distinct word.
    """
    return dict(Counter(words))


def rank_words(counts):
    """Return the (word, count) pairs of *counts*, most frequent first.

    The sort is stable, so words with equal counts keep the order they
    have in *counts*.
    """
    # A dict has no sortable order of its own, so sort a list of its items.
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


def main(path=SOURCE_PATH, top_n=TOP_N):
    """Read *path*, print each intermediate result and the *top_n* words."""
    # 1. Read the whole file as one string; ``with`` closes the file even
    #    if reading fails.
    with open(path, mode='r', encoding='utf-8') as the_file:
        the_text = the_file.read()
    print(the_text)

    # 2. Replace all punctuation with spaces.
    print(clean_text(the_text))

    # 3. List: the sequence of words as they occur.
    the_list = split_words(the_text)
    print(the_list)

    # 4. Set: which words occur. Dict: how often each word occurs.
    print(set(the_list))
    the_dict = count_words(the_list)
    print(the_dict)

    # 5. Sort by frequency, highest first.
    print(list(the_dict.items()))
    word_count_list = rank_words(the_dict)
    print(word_count_list)

    # 6. Print the top entries. Slicing instead of indexing avoids an
    #    IndexError when the text has fewer than top_n distinct words.
    for pair in word_count_list[:top_n]:
        print(pair)


if __name__ == '__main__':
    main()