首页 > 其他 > 详细

利用同义词林计算词的相似度——基于路径与深度的同义词词林词语相似度计算

时间:2019-12-31 09:46:10      阅读:97      评论:0      收藏:0      [点我收藏+]

我主要根据《中文信息学报》2016年9月第30卷第5期中的《基于路径与深度的同义词词林词语相似度计算》提及到的计算公式进行了代码实现,并根据其最终的结果对原设计进行了微调。

import os
import csv
import re
class Node(object):
    """One entry of the synonym tree.

    Attributes:
        id: private copy of the code segments identifying this entry.
        words: private copy of the words attached to this entry.
        children: child nodes, filled in through ``append_node``.
        parent: owning node, ``None`` until attached with ``append_node``.
        level: number of code segments, capped at 5.
    """

    def __init__(self, id, children, parent, words):
        # ``children`` and ``parent`` are accepted for signature compatibility
        # only; every node starts detached and childless.
        self.id = id.copy()
        self.words = words.copy()
        self.children = []
        self.parent = None
        self.level = min(len(id), 5)

    def append_node(self, child):
        """Attach ``child`` below this node and point it back at this node."""
        self.children.append(child)
        child.parent = self

def normalize(cilin):
    """Turn one whitespace-separated line into a detached ``Node``.

    The first token is the entry code; it is cut into segments where every
    non-digit character is its own segment and digits are taken in pairs.
    All remaining tokens become the node's words.
    """
    tokens = cilin.split()
    code = tokens[0]
    words = tokens[1:]
    segments = re.findall(r"\D|\d\d", code)
    return Node(segments, None, None, words)


def findparent(Root, sub_node, depth):
    """Locate the node under which ``sub_node`` should be attached.

    Starting at ``Root``, repeatedly step into the first child whose id
    segment at the current depth equals that of ``sub_node``, increasing the
    depth by one per step.  The walk stops at the first node that has no
    children or no matching child, and that node is returned.
    """
    current = Root
    while current.children:
        for candidate in current.children:
            if sub_node.id[depth] == candidate.id[depth]:
                # Descend: the true parent is somewhere below this child.
                current = candidate
                depth += 1
                break
        else:
            # No child shares the segment, so ``current`` is the parent.
            return current
    return current

def add2dict(word_dict, node):
    """Register ``node`` under each of its words in ``word_dict``.

    ``word_dict`` maps a word to the list of nodes containing it; the list is
    created on first use.  The dictionary is modified in place and nothing is
    returned.
    """
    for entry in node.words:
        word_dict.setdefault(entry, []).append(node)

def AnalyzeCilin(filename):
    """Build the synonym tree and the word index from a GBK-encoded file.

    Each non-blank line is parsed with ``normalize`` and attached below the
    node chosen by ``findparent``, so lines are expected to appear with every
    parent entry before its children.  Nodes whose level is 5 are also
    indexed by word through ``add2dict``.

    Args:
        filename: path of the GBK-encoded text file to read.

    Returns:
        ``(Root, word_dict)`` where ``Root`` is the synthetic top node and
        ``word_dict`` maps each word to the list of level-5 nodes holding it.
        ``(None, None)`` is returned if no parent could be determined.
    """
    word_dict = {}
    Root = Node(['Root'], None, None, ['Root'])
    with open(filename, "r", encoding="gbk") as f:
        for cilin in f:
            # A blank line has no code token; parsing it would raise
            # IndexError, so skip it instead of aborting the whole load.
            if not cilin.strip():
                continue
            cilin_node = normalize(cilin)
            temp = findparent(Root, cilin_node, 0)
            if temp is None:
                # Defensive guard kept from the original implementation.
                print("error")
                print(cilin_node.id)
                return None, None
            temp.append_node(cilin_node)
            if cilin_node.level == 5:
                add2dict(word_dict, cilin_node)
    return Root, word_dict

def GenerateParents(word_node, Root):
    """Return the chain from ``word_node`` up to ``Root``, both included.

    The list starts at ``word_node`` and follows ``parent`` links; ``Root`` is
    always the last element.  If ``word_node`` is ``Root`` the result is
    ``[Root]``.
    """
    lineage = []
    cursor = word_node
    while True:
        if cursor == Root:
            lineage.append(Root)
            return lineage
        lineage.append(cursor)
        cursor = cursor.parent

def find_sub_tree(word1, word2, Root, word_dict):
    """Pick the pair of senses of two words with the deepest shared ancestor.

    Every node listed for ``word1`` is paired with every node listed for
    ``word2``.  For each pair the deepest node common to both ancestor chains
    is determined, and the pair whose common node has the greatest ``level``
    wins (the first such pair on ties).

    Args:
        word1, word2: words to look up; both must be keys of ``word_dict``
            (a missing key raises ``KeyError``).
        Root: top node of the tree.
        word_dict: mapping from word to the list of nodes containing it.

    Returns:
        ``(common_node, word1_node, word2_node)``.  All three are ``Root`` if
        no pair shares any ancestor.
    """
    maxdepth = 0
    tar_node = Root
    sub_word1 = Root
    sub_word2 = Root
    # Ancestor chains are reversed to run root-first so that chains of any
    # length line up on Root instead of relying on a fixed depth of six.
    # Each chain is computed once rather than once per pairing.
    word2_chains = [
        (node, GenerateParents(node, Root)[::-1]) for node in word_dict[word2]
    ]
    for word1_node in word_dict[word1]:
        word1_chain = GenerateParents(word1_node, Root)[::-1]
        for word2_node, word2_chain in word2_chains:
            common = None
            for left, right in zip(word1_chain, word2_chain):
                if left != right:
                    break
                common = left  # still on the shared part of both paths
            if common is not None and maxdepth < common.level:
                maxdepth = common.level
                tar_node = common
                sub_word1 = word1_node
                sub_word2 = word2_node
    return tar_node, sub_word1, sub_word2

def CalculateK(sub_root, word1_node, word2_node):
    """Return the gap between two nodes at their first differing id segment.

    Segments are compared in order over the smaller of the two node levels.
    At the first mismatch the result is the absolute difference of the
    character codes (alphabetic segment) or of the integer values (numeric
    segment).  A mismatch on any other kind of segment prints ``Error`` and
    yields 0, as does the absence of any mismatch.  ``sub_root`` is not used.
    """
    depth = min(word1_node.level, word2_node.level)
    for idx in range(depth):
        left = word1_node.id[idx]
        right = word2_node.id[idx]
        if left == right:
            continue
        if left.isalpha():
            return abs(ord(left) - ord(right))
        if left.isdigit():
            return abs(int(left) - int(right))
        print("Error")
        return 0
    return 0


def Sim(word1, word2, Root, word_dict):
    """Compute the similarity of two words from their tree positions.

    The best-matching pair of nodes and their deepest common node come from
    ``find_sub_tree``.  If both words sit in that very same node the result
    is 0.98 when the node code ends in ``#`` and 1 otherwise.  In every other
    case the result is ``Comm / (Comm + Diff)`` where

    * ``Comm`` is 0.9 plus the weights of the levels down to the common node;
    * ``Diff`` is the weights of the levels separating each word node from
      the common node, plus the weight just below the common node scaled by
      ``K / number of children of the common node`` (``K`` from
      ``CalculateK``).

    The intermediate ``Comm`` and ``Diff`` values are printed.

    Args:
        word1, word2: words to compare; both must be keys of ``word_dict``.
        Root: top node of the tree.
        word_dict: mapping from word to the list of nodes containing it.

    Returns:
        The similarity as a number in (0, 1].
    """
    weight = [8, 6, 4, 1.5, 0.5]
    sub_root, word1_node, word2_node = find_sub_tree(word1, word2, Root, word_dict)
    if sub_root == word1_node and sub_root == word2_node:
        # Both words live in one node.  Only a '#' code lowers the score;
        # any other suffix used to fall through and index past the end of
        # ``weight``, raising IndexError.
        if sub_root.id[-1] == '#':
            return 0.98
        return 1
    K = CalculateK(sub_root, word1_node, word2_node)
    Comm = 0.9
    for i in range(sub_root.level):
        Comm = Comm + weight[i]
    Diff = (float(weight[sub_root.level]) * K) / len(sub_root.children)
    for i in range(sub_root.level, word1_node.level):
        Diff = Diff + weight[i]
    for i in range(sub_root.level, word2_node.level):
        Diff = Diff + weight[i]
    print(word1, word2, "Comm=", Comm, "Diff", Diff)
    return Comm / (Comm + Diff)

def SimCalculate(Root, filename, word_dict):
    """Score every word pair listed in a GBK-encoded CSV file.

    The first two columns of each row are the words to compare.  The
    similarity from ``Sim`` is appended to the row, or the text "不在库中"
    when either word is missing from ``word_dict``.  Each row's two words and
    result are printed.

    Returns:
        The list of rows, each extended with its result.
    """
    with open(filename, 'r', encoding="gbk") as csvfile:
        rows = list(csv.reader(csvfile))
    for row in rows:
        if row[0] in word_dict and row[1] in word_dict:
            outcome = Sim(row[0], row[1], Root, word_dict)
        else:
            outcome = "不在库中"
        row.append(outcome)
        print(row[0], row[1], row[-1])
    return rows

def Save2File(SimResult, filename):
    """Write the result rows to a GBK-encoded CSV file.

    A ``Word1,Word2,Similarity`` header row is written first, followed by
    every row of ``SimResult``.

    Args:
        SimResult: iterable of rows (sequences) to write.
        filename: destination path; an existing file is overwritten.
    """
    headers = ['Word1', 'Word2', 'Similarity']
    # newline='' hands line-ending control to the csv writer; without it the
    # writer's "\r\n" becomes "\r\r\n" on Windows and shows up as blank rows.
    with open(filename, 'w', encoding="gbk", newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(headers)
        csv_writer.writerows(SimResult)

if __name__ == '__main__':
    # Script entry: load the synonym tree, score the test pairs, save the
    # scored rows.
    cilin_path = "cilin_扩展版.txt"
    cases_path = "测试用例.csv"
    result_path = "测试用例的相似度(最终版).csv"

    Root, word_dict = AnalyzeCilin(cilin_path)
    SimResult = SimCalculate(Root, cases_path, word_dict)
    Save2File(SimResult, result_path)

未完待续,代码的进一步说明待到考完期末考试再补。

利用同义词林计算词的相似度——基于路径与深度的同义词词林词语相似度计算

原文:https://www.cnblogs.com/ecnu/p/12122511.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!