import xml.dom.minidom from pandas import Series # 存储句子 data_list = [] # 存储句子标注极性 label_list = [] if __name__ == ‘__main__‘: # 获取xml文件 document_tree = xml.dom.minidom.parse(‘../data/SMP2019_ECISA_Train.xml‘) # 获取文件中的元素 collection = document_tree.documentElement # 打印文件 # print(collection.toxml()) # 获取xml文件中子标签内容 Doc_node = collection.getElementsByTagName("Doc") for i in range(len(Doc_node)): sentence_node = Doc_node[i].getElementsByTagName("Sentence") for j in range(len(sentence_node)): # 剔除没有标注的数据 if sentence_node[j].getAttribute("label") == "0" or sentence_node[j].getAttribute("label") == "1" or sentence_node[j].getAttribute("label") == "2": # 获取句子数据 sentence = sentence_node[j].firstChild.data # 获取句子标注的极性 label = sentence_node[j].getAttribute("label") data_list.append(sentence) label_list.append(label) # 打印数据 with open("trainSMP.txt", "w") as f: for l in range(len(data_list)): f.write("__label__"+label_list[l]+" "+data_list[l]+"\n") # print(data_list[l]+"\t"+label_list[l]) # print(‘\n‘)
把原来这个样子
变成这个样子(扔进fasttext了)
参考资料:
情感分析xml数据集利用Python将xml文件中的句子提取出来
https://blog.csdn.net/qq_35386727/article/details/96364615
其他方法:
xml格式解析方法
https://blog.csdn.net/renyuanfang/article/details/90440465
原文:https://www.cnblogs.com/jie-74/p/13830186.html