# Complete English word-frequency count (2018-09-27).
from collections import Counter

# Punctuation marks that are turned into spaces before the text is split
# into words.
sep = '''.,;:?!-_'''

# Grammatical filler words (articles, pronouns, conjunctions, ...) that are
# left out of the statistics.
exclude = {'a', 'the', 'and', 'i', 'you', 'in'}


def split_words(text):
    """Return the words of *text*, lower-cased and with punctuation removed.

    Every character listed in ``sep`` is replaced by a space, then the text
    is split on whitespace.
    """
    # Lower-casing first makes "The" and "the" count as one word and lets the
    # lower-case entries of ``exclude`` (notably 'i') actually match.
    table = str.maketrans(sep, ' ' * len(sep))
    return text.lower().translate(table).split()


def count_words(words):
    """Return a dict mapping each word not in ``exclude`` to its count."""
    # Counter tallies in one pass instead of one list.count() scan per
    # distinct word.
    tally = Counter(words)
    return {word: n for word, n in tally.items() if word not in exclude}


def rank_words(counts):
    """Return the ``(word, count)`` pairs of *counts*, most frequent first."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def word_frequency_report(path='Nothingtolose.txt', top=20):
    """Print word-frequency statistics for the UTF-8 text file at *path*.

    Prints, in order: the raw text, the word list, the set of distinct
    non-excluded words, the count dictionary, the unsorted and the sorted
    ``(word, count)`` lists, and finally the first *top* ranked entries.

    Returns the full ranked list of ``(word, count)`` pairs.
    """
    # Read the whole text; the context manager closes the file even on error.
    with open(path, 'r', encoding='utf-8') as fo:
        strYee = fo.read()
    print(strYee)

    # Preprocess: normalise case, drop punctuation, split into a word list.
    strList = split_words(strYee)
    print(len(strList), strList)

    # Distinct words with the filler words removed.
    strSet = set(strList) - exclude
    print(len(strSet), strSet)

    # Word -> number of occurrences.
    strDic = count_words(strList)
    print(len(strDic), strDic)

    # Sort by frequency, highest first.
    wcList = list(strDic.items())
    print(wcList)
    wcList = rank_words(strDic)
    print(wcList)

    # Slicing copes with texts that have fewer than ``top`` distinct words,
    # where indexing 0..top-1 would raise IndexError.
    for entry in wcList[:top]:
        print(entry)
    return wcList


if __name__ == '__main__':
    word_frequency_report()
import jieba


def segmentation_demo(path='我们的少年时代.txt'):
    """Print the UTF-8 text at *path* and three jieba segmentations of it.

    The three printed lists come from, in order: ``jieba.cut`` with its
    default arguments, ``jieba.cut`` with ``cut_all=True``, and
    ``jieba.cut_for_search``.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as fo:
        strYee = fo.read()
    print(strYee)

    # The jieba calls are wrapped in list() so the tokens themselves are
    # printed.
    print(list(jieba.cut(strYee)))
    print(list(jieba.cut(strYee, cut_all=True)))
    print(list(jieba.cut_for_search(strYee)))


if __name__ == '__main__':
    segmentation_demo()
# Source: https://www.cnblogs.com/GZCC-11-28/p/9712278.html