首页 > 其他 > 详细

一些好用的代码

时间:2019-09-18 20:47:19      阅读:90      评论:0      收藏:0      [点我收藏+]
##### 正则表达式清洗文本 #####

def re_fun(seq):
    """Strip unwanted characters from a piece of text.

    Two passes are applied:

    1. Every run of characters outside the whitelist (ASCII letters,
       whitespace, CJK ideographs U+4E00-U+9FA5 and the punctuation listed
       in the pattern) is deleted. Digits are not whitelisted, so they are
       removed in this pass.
    2. The punctuation ``< > : ; ! . 。 ,`` and newlines are deleted.

    Args:
        seq: The text to clean.

    Returns:
        The cleaned text. Spaces and tabs survive both passes.
    """
    # NOTE(review): the quotes around these patterns were lost when the
    # snippet was published; they are restored here. The repeated characters
    # in the class (";;", "::") suggest full-width variants may have been
    # normalised to ASCII as well -- confirm against the author's original.
    # Raw string: "\s" must reach the regex engine untouched, and ``re``
    # itself resolves the "\uXXXX" escapes.
    keep_rule = re.compile(
        r"[^a-zA-Z.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s:: + \u4e00-\u9fa5]+"
    )
    seq = keep_rule.sub("", seq)
    # Deleting the union of these characters in one pass gives the same
    # result as deleting each of them in turn.
    return re.sub(r"[<>:;!.。,\n]+", "", seq)

#############################################################################################################################################

##### 生成词表 #####
def vocab_fun(filename):
    """Count how often each token occurs in a UTF-8 encoded text file.

    Every line is cleaned with ``re_fun``, stripped of surrounding
    whitespace and split on single spaces. Empty tokens are not counted.

    Args:
        filename: Path of the file, opened through ``tf.gfile.GFile``.

    Returns:
        The ``ct.Counter`` mapping each token to its number of occurrences
        (``ct`` is presumably ``collections`` -- confirm against the
        file's imports).
    """
    vocab = ct.Counter()
    with codecs.getreader("utf-8")(tf.gfile.GFile(filename, "rb")) as reader:
        # Iterate the reader line by line rather than calling readlines(),
        # so the whole corpus is never held in memory at once.
        for line in reader:
            tokens = re_fun(line).strip().split(" ")
            # split(" ") yields "" for blank lines and for repeated spaces
            # (e.g. where re_fun deleted punctuation between two spaces);
            # those are not words and must not enter the vocabulary.
            vocab.update(token for token in tokens if token)
    return vocab

##########################################################################################################################################

##### 写入文件路径 #####
# Directory holding the corpus files. Raw string: "\m", "\p" and "\d" are
# not valid escape sequences, so a plain literal triggers an invalid-escape
# warning on current Python versions; the value itself is unchanged.
dir_path = r"D:\mathine_learning\pre_esti\dataset"

# File names of the two sides of the parallel corpus (".de" / ".en" suffix).
tgt = "europarl-v7.de-en.de"
src = "europarl-v7.de-en.en"

# basename() drops any directory part, so only the file name is appended
# to dir_path.
train_src = os.path.join(dir_path, os.path.basename(src))
train_tgt = os.path.join(dir_path, os.path.basename(tgt))

#########################################################################################################################################

##### 测bleu值 #####
from nltk.translate.bleu_score import corpus_bleu

# NOTE(review): src_file and tgt_file are not defined in this snippet; they
# must be set to the two file paths before it runs. The lines of src_file
# are used as references and the lines of tgt_file as hypotheses, following
# the original argument order -- confirm that this is the intended
# direction.
temp1 = []  # one list of references per sentence
temp2 = []  # one hypothesis per sentence

# ``with`` closes both files even if reading or tokenising raises.
with open(src_file, "r", encoding="utf-8") as src_seq, \
        open(tgt_file, "r", encoding="utf-8") as tgt_seq:
    for line1, line2 in zip(src_seq, tgt_seq):
        line1 = line1.strip("\n").split(" ")
        line2 = line2.strip("\n").split(" ")
        # corpus_bleu expects, for every hypothesis, a *list* of reference
        # token lists. Passing the bare token list would make each token
        # string be treated as a separate reference.
        temp1.append([line1])
        temp2.append(line2)

a = corpus_bleu(temp1, temp2)
print(a)

 

一些好用的代码

原文:https://www.cnblogs.com/hanouba/p/11544867.html

(1)
(1)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!