层次短语模型是David Chiang在短语模型的基础之上提出的模型,属于形式化句法翻译模型。它将普通短语拓展成了带变量的层次化短语,例如“X1 和 X2”。
本文着重讲述层次短语模型的短语规则抽取模块,也就是如何从双语句对的训练集中抽取出“短语表”。
系统的总体框架可以参看下图:
系统中出现的实体都被抽象成了对象,这一点从各个类的命名即可看出。
下面我们将对重要的子过程进行讲述:
1、LexTranslator词到词的翻译
底层的数据结构如下:
// Word-to-word lexical translation tables (two-level maps keyed by Word).
typedef map<Word, Float> LexTableItem;     // one row: Word -> Float value
typedef map<Word, LexTableItem> LexTable;  // Word -> row of (Word -> Float)

LexTable f2e_table_;  // prob(0.0-1.0) from f to e
LexTable e2f_table_;  // presumably the reverse direction (e to f) -- TODO confirm
2、对齐一致性的抽取
对齐一致性的抽取采用了前缀和数组:先分别统计源端和目标端每个位置上的对齐链接数并求前缀和,这样任意区间内的链接数都可以用一次减法得到。对于一个源端区间及其对齐到的最小目标端区间,如果两个区间内的链接数相等(即没有链接跨出区间),则该区间满足对齐一致性。
void Alignment::CreateTightConsistSpan(int src_limit, int trg_limit) { //count the size of alignment of prefix vector<int> src_count, trg_count; src_count.resize(m_src_size, 0); trg_count.resize(m_trg_size, 0); for (size_t i = 0; i < (size_t)m_src_size; i++) { for (size_t j = 0; j < m_wa[i].size(); j++) { src_count[i]++; trg_count[m_wa[i][j]]++; }//end for j }//end for i for (size_t i = 1; i < src_count.size(); i++) src_count[i] += src_count[i - 1]; for (size_t i = 1; i < trg_count.size(); i++) trg_count[i] += trg_count[i - 1]; Alignment::Span trg; for (int begin = 0; begin < m_src_size; begin++) { trg.first = MAX_INT; trg.second = MIN_INT; for (int dist = 1; dist <= src_limit && dist + begin - 1 < m_src_size; dist++) { int end = begin + dist - 1; for (size_t i = 0; i < m_wa[end].size(); i++) { if (trg.first > m_wa[end][i]) trg.first = m_wa[end][i]; if (trg.second < m_wa[end][i]) trg.second = m_wa[end][i]; } if (trg.first > trg.second) //null alignment continue; if (trg.second - trg.first + 1 > trg_limit) continue; int f = src_count[end]; if (begin != 0) f -= src_count[begin - 1]; f -= trg_count[trg.second]; if (trg.first != 0) f += trg_count[trg.first - 1]; if (f == 0) //consistent to align { //tight consist, boundary words must have alignments if (m_wa[begin].size() != 0 && m_wa[end].size() != 0) m_consist_spans[Alignment::Span(begin, end)] = trg; } } } }
3、Extractor的抽取规则模块讲解
void Extractor::Extract(const string& src_file, const string& trg_file, const string& wa_file) { ifstream in_src, in_trg, in_wa; ReadFile(src_file, in_src); ReadFile(trg_file, in_trg); ReadFile(wa_file, in_wa); Log::Instance().Out() << "Starting to extract rule!" << endl; Log::Instance().TimeStart(); map<string, Rule *> sent_rules;//store the rules extracted from a sentence map<string, Rule *> rule_map; //cache for store extracted but not yet output file string src, trg, wa; int part_file_id = 0; int sent_id = 0; int rule_count = 0; while (getline(in_src, src) && getline(in_trg, trg) && getline(in_wa, wa)) { sent_id ++; SentPair sent; sent.SetSentId(sent_id - 1); if (sent.Init(src, trg, wa)) sent.ExtractRules(sent_rules); else continue; rule_count += sent_rules.size(); LocalCombine(sent_rules, rule_map); if ((int) rule_map.size() > StaticData::Instance().Capacity()) { OutCache(m_part_file, part_file_id, e2f, rule_map); part_file_id++; } if (sent_id % 10000 == 0) { Log::Instance().Out() << "cur sent_id:" << sent_id <<endl;; } } OutCache(m_part_file, part_file_id, e2f, rule_map); in_src.close(); in_trg.close(); in_wa.close(); Log::Instance().Out() << "end extracted rule in time (s):" << Log::Instance().TimeEnd() << endl; }
4、规则概率估算
1)合并所有的临时文件->一个e2f的文件A
2)对A进行排序
3)计算f2e的概率,并且生成f2e文件B
4)对B进行排序
5)计算e2f的概率,并且生成最终规则文件
5、抽取一个句对中所有的规则
void SentPair::ExtractRules(std::map<string, Rule *>& rule_map) { SentenceMeta sm; sm.sent_id_ = this->sent_id_; sm.src_ = &src_; sm.trg_ = &trg_; StaticData::Instance().GetFeatureSet().Prepare(sm); // use cky-style algorithm to find all consistent rule for (int dist = 1; dist <= StaticData::Instance().SrcSpanLimit(); dist++) { for (size_t begin = 0; begin + dist - 1 < src_.size(); begin++) { pair<int,int> span; span.first = begin; span.second = begin + dist - 1; if (Log::Instance().IsVerbose(3)) { Log::Instance().Out() << "\n deal span (" << span.first << ", " << span.second << ")" <<endl; } GetRule(span, rule_map); } //end begin } //end dist map<string, Rule *>::const_iterator citer; for (citer = rule_map.begin(); citer != rule_map.end(); citer++) StaticData::Instance().GetFeatureSet().Final(sm, *citer->second); }
void SentPair::GetRule(const pair<int,int>& span, map<string ,Rule *>& rule_map) { // current span must be consist Alignment::SpanAlign::const_iterator citer; const Alignment::SpanAlign& cs = wa_->GetConsistSpans(); map<string, Rule *>::iterator iter; citer = cs.find(span); if (citer == cs.end()) return; // TODO support extract boundary expansion // full lexical rule trg_span shall be small than limit SentenceMeta sm; sm.sent_id_ = this->sent_id_; sm.src_ = &src_; sm.trg_ = &trg_; Context context; context.src_span_ = span; context.trg_span_ = citer->second; //extract bp if (span.second - span.first + 1 <= StaticData::Instance().InitPhraseLimit()) { vector<pair<int,int> > empty; Rule * rule = new Rule(); CreateSrcTrg(span, empty, citer->second, empty, rule->src_rhs_, rule->trg_rhs_, rule->wa_); StaticData::Instance().GetFeatureSet().Traverse(sm, context, 1.0, *rule); //cout << "rule->fract_count_: " << rule->fract_count_ << endl; iter = rule_map.find(rule->Key()); if (iter == rule_map.end()) { rule_map[rule->Key()] = rule; } else { iter->second ->Add(*rule); delete rule; } } //extract rules with variable vector<vector<pair<int,int> > > var_span; EnumerateVar(span, var_span); vector<pair<int,int> > trg_childs_span; for (size_t i = 0; i < (int)var_span.size(); i++) { trg_childs_span.resize(var_span[i].size()); for (size_t j = 0; j < var_span[i].size(); j++) trg_childs_span[j] = cs.find(var_span[i][j])->second; Rule *rule = new Rule(); CreateSrcTrg(span, var_span[i], citer->second, trg_childs_span, rule->src_rhs_, rule->trg_rhs_, rule->wa_); //cout << "rule->fract_count_: " << rule->fract_count_ << endl; //if (rule->m_wa.size() == var_span[i].size()) {//must have lexical alignment if (rule->AlignLinkCount() == var_span[i].size()) //must have lexical alignment { delete rule; continue; } context.src_var_spans_ = var_span[i]; context.trg_var_spans_ = trg_childs_span; StaticData::Instance().GetFeatureSet().Traverse(sm, context, (Float) 1.0/var_span.size(), *rule); iter = 
rule_map.find(rule->Key()); if (iter == rule_map.end()) { rule_map[rule->Key()] = rule; } else { iter->second->Add(*rule); delete rule; } } }
原文:http://blog.csdn.net/ict2014/article/details/24252815