import re
import jieba

doc1 = '''曝甜瓜和篮网球员一起训练 新赛季有望加入篮网?:自从上赛季在火箭经历了失败后,安东尼一直无人问津,今年夏天关于他打不上球的话题也是被反复议论,根据最新消息,安东尼最近正和篮网球员一起训练。'''
doc2 = '''周琦团队:感谢新疆耐心等待 新疆仍支持周琦留洋:昨天,男篮国手周琦重返新疆男篮的事尘埃落定。此后,周琦团队很快发表声明,新疆男篮仍会支持周琦挑战高水平联赛,而周琦之所以没有最终到欧洲去打球,是因为兼顾国家队备战,无法抽身去参加试训。'''
doc3 = '''官宣!新疆宣布周琦回归 新赛季向总冠军发起冲击:北京时间8月13日,新疆篮球俱乐部官宣已经与周琦完成签约。周琦将重新身披新疆队战袍,征战CBA赛场。新赛季周琦与球队向总冠军发起冲击。'''
doc4 = '''官宣!新疆宣布周琦回归 新赛季向总冠军发起冲击'''
doc5 = '''欧联-武磊替补登场 西班牙人2-2总比分5-3进正赛_手机搜狐网,阵容方面,西班牙人主帅加耶戈排出4-3-3阵型,卡莱罗搭档路易斯-洛佩斯出任中卫,达德尔、罗卡、格拉内罗组成'''

doc_complete = [doc1, doc2, doc3]


def fenci(title):
    """Clean a title, segment it with jieba, and drop stop words."""
    corpus = title
    # One stop word per line in the stop-word file.
    stopwords = [line.strip() for line in open('../data/stopWord.txt', 'r', encoding='utf-8').readlines()]
    # Strip bracketed fragments, hashtags, digits, and ASCII/full-width punctuation.
    r4 = "\\【.*?】+|\\《.*?》+|\\#.*?#+|[0-9]+|[.!/_,$&%^*()<>+\"'?@|:~{}#]+|[——!\\\\,。=?、:“”‘’¥……()《》【】]"
    # Normalize score patterns such as "2-2" or "5:3" to the token 比分 before the digits are stripped.
    BIFEN_RE = re.compile(r'[0-9]+[-::+-]\s{0,5}[0-9]+')
    doc = BIFEN_RE.sub('比分', corpus)
    doc = re.sub(r'VS', '对战', doc, flags=re.IGNORECASE)
    doc = re.sub(r'8强', '八强', doc)
    doc = re.sub(r'4强', '四强', doc)
    doc = re.sub(r'16强', '十六强', doc)
    doc = re.sub(r4, ' ', doc)
    jieba.load_userdict("../data/user_dict.txt")
    seg = jieba.cut(doc)
    result = ''
    for word in seg:
        if word not in stopwords:
            result += word + ' '
    return result


doc_clean = [fenci(doc).split() for doc in doc_complete]
# print(doc_clean)

# LDA
import gensim
from gensim import corpora

# Create the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Lda = gensim.models.ldamodel.LdaModel

# Run and train the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=5))

# DF: per-document term counts, sorted by frequency.
for doc in doc_clean:
    dic = {}
    for word in doc:
        if word in dic:
            dic[word] += 1
        else:
            dic[word] = 1
    print(dic)
    l = sorted(dic.items(), key=lambda d: d[1], reverse=True)
    print(l)

import jieba.analyse

# TF-IDF keyword extraction, restricted to noun POS tags.
keywords = jieba.analyse.extract_tags(doc3, topK=5, withWeight=True, allowPOS=('n', 'nr', 'ns'))
for item in keywords:  # each item is a (word, weight) tuple
    print(item[0], item[1])

print('****************')

# TextRank keyword extraction with the same settings.
keywords = jieba.analyse.textrank(doc3, topK=5, withWeight=True, allowPOS=('n', 'nr', 'ns'))
for item in keywords:
    print(item[0], item[1])
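The trained model can also be queried with a title that was not in the three training documents. The following is a minimal follow-up sketch, not part of the original script, that reuses fenci, dictionary, and ldamodel from above and assumes gensim's get_document_topics API; it returns (topic_id, probability) pairs for the unseen title doc4.

# Minimal sketch: infer the topic mixture of an unseen title (reuses fenci, dictionary, ldamodel, doc4).
new_bow = dictionary.doc2bow(fenci(doc4).split())
print(ldamodel.get_document_topics(new_bow, minimum_probability=0.0))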
from gensim.models import word2vec
import jieba
import re
import numpy as np

# Sentence similarity with word2vec.
model = word2vec.Word2Vec.load('../model/last.model')


def fenci(title):
    """Clean a title and return its whitespace-joined jieba segmentation (no stop-word filtering here)."""
    corpus = title
    # Strip bracketed fragments, hashtags, and ASCII/full-width punctuation.
    r4 = "\\【.*?】+|\\《.*?》+|\\#.*?#+|[.!/_,$&%^*()<>+\"'?@|:~{}#]+|[——!\\\\,。=?、:“”‘’¥……()《》【】]"
    # Normalize score patterns such as "2-2" or "5:3" to the token 比分.
    BIFEN_RE = re.compile(r'[0-9]+[-::+-]\s{0,5}[0-9]+')
    doc = re.sub(r4, ' ', corpus)
    doc = BIFEN_RE.sub('比分', doc)
    doc = re.sub(r'VS', '对战', doc, flags=re.IGNORECASE)
    doc = re.sub(r'8强', '八强', doc)
    doc = re.sub(r'4强', '四强', doc)
    doc = re.sub(r'16强', '十六强', doc)
    jieba.load_userdict("../data/user_dict.txt")
    seg = jieba.cut(doc)
    result = ''
    for word in seg:
        result += word + ' '
    return result


sent1 = u'欧联-武磊替补登场 西班牙人2-2总比分5-3进正赛_手机搜狐网'
sent2 = u'武磊替补登场'


def getsentecevec(sentence):
    """Average the word2vec vectors of all in-vocabulary words in a sentence."""
    vec = np.zeros(100)  # the model uses 100-dimensional vectors
    count = 0
    for wor in fenci(sentence).split():
        try:
            vec += model.wv[wor]
            count += 1
        except KeyError:
            # word not in the vocabulary, skip it
            pass
    if count > 0:
        vec /= count
    return vec


def xiangsidu(vec_a, vec_b):
    """Print the cosine similarity of two sentence vectors."""
    a = np.array(vec_a)
    b = np.array(vec_b)
    ret = np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
    print('分数:' + str(ret))


doc1 = u'欧联-武磊替补登场 西班牙人2-2总比分5-3进正赛_手机搜狐网'
doc2 = u'西媒盛赞武磊:没有让人失望!跑位很出色 踢法灵活本报讯(记者王帆)'
doc3 = u'西班牙人前瞻:武磊有望重回首发 伤员悉数归队_手机搜狐网'
doc4 = u'表现平平!武磊仅获西媒评5分 0射门进攻端无贡献_手机搜狐网'
doc = u'武磊替补登场'
str5 = "篮球贡献获世界肯定 姚明将入选NBA名人堂 晚间体育新闻"
str6 = "姚明已确定入选2016届名人堂 午间体育新闻"
str7 = "篮球贡献获世界肯定 姚明将入选NBA名人堂"
str8 = "美媒:篮球明星姚明将入选美国NBA名人堂"
str9 = "姚明已确定入选2016届名人堂"
str1 = "[冠军欧洲]特别企划:狭路相逢 球队交锋史"
str2 = "[冠军欧洲]特别企划:逆转之态 欧冠小百科"
str3 = "[冠军欧洲]特别企划:逆转之态 姚明小百科"

doc_c = [str5, str6, str7, str8, str2, doc1, doc2, str3, doc4]

# Score every candidate title against str9.
vec_a = getsentecevec(str9)
for d in doc_c:
    vec_b = getsentecevec(d)
    xiangsidu(vec_a, vec_b)

# sent1 = u'军训服蹲下崩线'
# sent2 = u'军训服蹲下就崩线'
# sent1 = u'吴亦凡女友身份 '
# sent2 = u'吴亦凡女友疑曝光'
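As a quick usage check of the two helpers, the minimal sketch below (assuming the same '../model/last.model' can be loaded) scores the sent1/sent2 pair that is defined but never used in the script; getsentecevec silently skips out-of-vocabulary words, so the score reflects only words the model knows. For word lists that are entirely in vocabulary, gensim's n_similarity gives a comparable mean-vector cosine score directly.

# Usage sketch: score the sent1 / sent2 pair with the helpers defined above.
vec1 = getsentecevec(sent1)
vec2 = getsentecevec(sent2)
xiangsidu(vec1, vec2)

# Alternative for fully in-vocabulary word lists: let gensim compute the cosine
# of the two mean vectors itself.
words1 = [w for w in fenci(sent1).split() if w in model.wv]
words2 = [w for w in fenci(sent2).split() if w in model.wv]
if words1 and words2:
    print(model.wv.n_similarity(words1, words2))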
Source: https://www.cnblogs.com/meikon/p/11494689.html