// Forward pass through the network: computes the probability distribution over the next word
void CRnnLM::computeNet(int last_word, int word)
{
//last_word is the index of the word currently at the input layer (the previous word w(t))
//word is the index of the word to be predicted
int a, b, c;
real val;
double sum; //sum is used for normalization: it's better to have larger precision as many numbers are summed together here
//Set the ac value of the neuron corresponding to last_word to 1; this can be seen as the 1-of-V encoding of that word
if (last_word!=-1) neu0[last_word].ac=1;
//Compute the input-to-hidden-layer part below
for (a=0; a<layer1_size; a++) neu1[a].ac=0;
for (a=0; a<layerc_size; a++) neuc[a].ac=0;
//This computes the product of s(t-1) with syn0
matrixXvector(neu1, neu0, syn0, layer0_size, 0, layer1_size, layer0_size-layer1_size, layer0_size, 0);
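/* A quick sketch of what this call computes (assuming the usual rnnlm matrixXvector signature
   matrixXvector(dest, srcvec, srcmatrix, matrix_width, from, to, from2, to2, type),
   where type==0 means "accumulate into the ac values"). The call above would then be equivalent to:

       for (b=0; b<layer1_size; b++)
           for (a=layer0_size-layer1_size; a<layer0_size; a++)
               neu1[b].ac += neu0[a].ac * syn0[a + b*layer0_size].weight;

   Since layer0_size = vocab_size + layer1_size, the range [layer0_size-layer1_size, layer0_size)
   is exactly the part of neu0 that stores s(t-1), so this is the recurrent term W*s(t-1). */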
//This computes the product of the encoded last_word vector (of size vocab_size, with exactly one component equal to 1 and the rest 0) with syn0
for (b=0; b<layer1_size; b++) {
a=last_word;
if (a!=-1) neu1[b].ac += neu0[a].ac * syn0[a+b*layer0_size].weight;
}
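/* Because the input word is 1-of-V encoded, neu0[last_word].ac is 1 and (assuming the rest of
   the vocabulary part of neu0 is zero, as the toolkit clears the previous activation elsewhere)
   every other term of the sum vanishes, so this loop simply adds column last_word of syn0:
       neu1[b].ac += syn0[last_word + b*layer0_size].weight; */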
//Apply the sigmoid function to the hidden-layer input (the ac values accumulated above)
for (a=0; a<layer1_size; a++) {
//For numerical stability, clamp the ac value to [-50, 50]
//The papers mention that keeping the model parameters small gives somewhat better generalization
if (neu1[a].ac>50) neu1[a].ac=50;
if (neu1[a].ac<-50) neu1[a].ac=-50;
val=-neu1[a].ac;
//fasterexp is implemented in fastexp.h and should be faster than exp from math.h
neu1[a].ac=1/(1+fasterexp(val)); //sigmoid function: 1/(1+e^(-x))
}
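/* In the notation of the RNNLM papers, the loop above completes the hidden state
       s(t) = sigmoid(U*w(t) + W*s(t-1))
   where w(t) is the 1-of-V vector for last_word, U is the vocabulary part of syn0 and
   W is its recurrent part. */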
if (layerc_size>0) {
//Compute the hidden-layer to compression-layer result
matrixXvector(neuc, neu1, syn1, layer1_size, 0, layerc_size, 0, layer1_size, 0);
//Similar to above, but for the compression layer
for (a=0; a<layerc_size; a++) {
if (neuc[a].ac>50) neuc[a].ac=50; //for numerical stability
if (neuc[a].ac<-50) neuc[a].ac=-50; //for numerical stability
val=-neuc[a].ac;
neuc[a].ac=1/(1+fasterexp(val));
}
}
//1->2 class
//Zero the ac values of the class part (class_size units) of the output layer
for (b=vocab_size; b<layer2_size; b++) neu2[b].ac=0;
//Compute compression layer to class layer (part of the output layer)
if (layerc_size>0) {
matrixXvector(neu2, neuc, sync, layerc_size, vocab_size, layer2_size, 0, layerc_size, 0);
}
else
{
//No compression layer: compute hidden layer to output layer directly
matrixXvector(neu2, neu1, syn1, layer1_size, vocab_size, layer2_size, 0, layer1_size, 0);
}
//Maximum-entropy (ME) n-gram feature computation
if (direct_size>0) {
//Note that hash is declared inside this if block, so it cannot be accessed outside of it
//Below you will see that a separate local hash is declared each time
//hash[i] holds the index into syn_d where the parameters of the (i+1)-gram feature start
unsigned long long hash[MAX_NGRAM_ORDER]; //this will hold pointers to syn_d that contains hash parameters
for (a=0; a<direct_order; a++) hash[a]=0;
//Map each n-gram feature to a single hash value; the weights here are for the class part
for (a=0; a<direct_order; a++) {
b=0;
if (a>0) if (history[a-1]==-1) break; //if OOV was in history, do not use this N-gram feature and higher orders
hash[a]=PRIMES[0]*PRIMES[1];
for (b=1; b<=a; b++) hash[a]+=PRIMES[(a*PRIMES[b]+b)%PRIMES_SIZE]*(unsigned long long)(history[b-1]+1); //update hash value based on words from the history
//In the ME part, the weights for classes live in the first half of syn_d (see the figure in the original post)
hash[a]=hash[a]%(direct_size/2); //make sure that starting hash index is in the first half of syn_d (second part is reserved for history->words features)
}
/* Let's unroll this code and trace it, assuming direct_order = 3 and no OOV in the history:
outer loop, 1st iteration:
	a = 0; a < 3
	b = 0;
	hash[0]=PRIMES[0]*PRIMES[1] = 108641969 * 116049371;
	inner loop, 1st check:
		b = 1; b <= 0 is false
		exit inner loop
	hash[0]=hash[0]%(direct_size/2)
outer loop, 2nd iteration:
	a = 1; a < 3;
	b = 0;
	hash[1]=PRIMES[0]*PRIMES[1] = 108641969 * 116049371;
	inner loop, 1st iteration:
		b = 1; b <= 1;
		hash[a]= hash[a] + PRIMES[(a*PRIMES[b]+b)%PRIMES_SIZE]*(unsigned long long)(history[b-1]+1)
		       = hash[1] + PRIMES[(1*PRIMES[1]+1)%PRIMES_SIZE]*(history[0]+1);
		exit inner loop
	hash[1]=hash[1]%(direct_size/2);
outer loop, 3rd iteration:
	a = 2; a < 3;
	b = 0;
	hash[2]=PRIMES[0]*PRIMES[1] = 108641969 * 116049371;
	inner loop, 1st iteration:
		b = 1; b <= 2;
		hash[2]= hash[2] + PRIMES[(2*PRIMES[1]+1)%PRIMES_SIZE]*(history[0]+1);
	inner loop, 2nd iteration:
		b = 2; b <= 2;
		hash[2]= hash[2] + PRIMES[(2*PRIMES[2]+2)%PRIMES_SIZE]*(history[1]+1)
		exit inner loop
	hash[2]=hash[2]%(direct_size/2);

Roughly, hash[i] is a hash of the (i+1)-gram history, since computing hash[i] takes history[0..i-1] into account.
The hashed value serves as an index into the syn_d array; the actual parameters connecting the
(i+1)-gram features to the output layer are stored in syn_d.
*/
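/* Putting the two hash computations of this function together, the layout of syn_d can be
   sketched as follows (this is what the figure mentioned above illustrates):

       [0, direct_size/2)             n-gram -> class features  (this block: hash[a] %= direct_size/2)
       [direct_size/2, direct_size)   n-gram -> word features   (the word block below adds direct_size/2)
*/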
//ME part: add the direct-connection contributions to the class layer scores, i.e., toward P(c_i | s(t))
for (a=vocab_size; a<layer2_size; a++) {
for (b=0; b<direct_order; b++) if (hash[b]) {
neu2[a].ac+=syn_d[hash[b]]; //apply current parameter and move to the next one
//To explain: the parameters connecting an (i+1)-gram feature to the output layer are stored
//contiguously in syn_d. The length of such a contiguous block comes in two cases: for the
//class part it is class_size, and for the word part it is the number of words in word's class
//The similar code further below works the same way
hash[b]++; //advance to the parameter for the next class output unit
} else break;
}
}
//activation 2 --softmax on classes
// 20130425 - this is now a 'safe' softmax
//Softmax normalization of the class probabilities
//Subtracting the maximum mainly prevents overflow: if an ac value were large, exp(ac) could overflow
sum=0;
real maxAc=-FLT_MAX; //FLT_MAX is the largest value representable by float
for (a=vocab_size; a<layer2_size; a++)
if (neu2[a].ac>maxAc) maxAc=neu2[a].ac; //this prevents the need to check for overflow
for (a=vocab_size; a<layer2_size; a++)
sum+=fasterexp(neu2[a].ac-maxAc);
for (a=vocab_size; a<layer2_size; a++)
neu2[a].ac=fasterexp(neu2[a].ac-maxAc)/sum;
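/* Written out, the class part of the output layer now holds a "safe" softmax:
       P(c_i | s(t)) = exp(ac_i - maxAc) / sum_j exp(ac_j - maxAc)
   which equals the ordinary softmax, but every exponent is <= 0, so fasterexp cannot overflow. */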
if (gen>0) return; //if we generate words, we don't know what current word is -> only classes are estimated and word is selected in testGen()
//1->2 word
if (word!=-1) {
//Zero the ac values of all words in word's class
for (c=0; c<class_cn[vocab[word].class_index]; c++) neu2[class_words[vocab[word].class_index][c]].ac=0;
//Compute the distribution for the word to be predicted; this distribution is over the words in word's class
if (layerc_size>0) {
//Computation from the compression layer to the words in word's class
matrixXvector(neu2, neuc, sync, layerc_size, class_words[vocab[word].class_index][0], class_words[vocab[word].class_index][0]+class_cn[vocab[word].class_index], 0, layerc_size, 0);
}
else
{
//Computation from the hidden layer to the words in word's class
matrixXvector(neu2, neu1, syn1, layer1_size, class_words[vocab[word].class_index][0], class_words[vocab[word].class_index][0]+class_cn[vocab[word].class_index], 0, layer1_size, 0);
}
}
//apply direct connections to words
if (word!=-1) if (direct_size>0) {
//ME: compute the n-gram features, this time for the word part
unsigned long long hash[MAX_NGRAM_ORDER];
for (a=0; a<direct_order; a++) hash[a]=0;
for (a=0; a<direct_order; a++) {
b=0;
if (a>0) if (history[a-1]==-1) break;
hash[a]=PRIMES[0]*PRIMES[1]*(unsigned long long)(vocab[word].class_index+1);
for (b=1; b<=a; b++) hash[a]+=PRIMES[(a*PRIMES[b]+b)%PRIMES_SIZE]*(unsigned long long)(history[b-1]+1);
hash[a]=(hash[a]%(direct_size/2))+(direct_size)/2;
}
//ME part: contributions to the predicted word's distribution, over the words in word's class
for (c=0; c<class_cn[vocab[word].class_index]; c++) {
a=class_words[vocab[word].class_index][c];
//The details here are the same as for the class part above (except that hash[b] is additionally wrapped with % direct_size)
for (b=0; b<direct_order; b++) if (hash[b]) {
neu2[a].ac+=syn_d[hash[b]];
hash[b]++;
hash[b]=hash[b]%direct_size;
} else break;
}
}
//activation 2 --softmax on words
// 130425 - this is now a 'safe' softmax
//Normalization of the probabilities here works the same way as above
sum=0;
if (word!=-1) {
maxAc=-FLT_MAX;
for (c=0; c<class_cn[vocab[word].class_index]; c++) {
a=class_words[vocab[word].class_index][c];
if (neu2[a].ac>maxAc) maxAc=neu2[a].ac;
}
for (c=0; c<class_cn[vocab[word].class_index]; c++) {
a=class_words[vocab[word].class_index][c];
sum+=fasterexp(neu2[a].ac-maxAc);
}
for (c=0; c<class_cn[vocab[word].class_index]; c++) {
a=class_words[vocab[word].class_index][c];
neu2[a].ac=fasterexp(neu2[a].ac-maxAc)/sum; //this prevents the need to check for overflow
}
}
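/* Note: computeNet only fills in the two conditional distributions. The probability of the
   full word is obtained elsewhere in the toolkit as their product:
       P(word | history) = P(class(word) | s(t)) * P(word | class(word), s(t))
   i.e., the class softmax above multiplied by the within-class softmax just computed. */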
}

Recurrent neural network language modeling toolkit: Source Code Walkthrough (6)
Original article: http://blog.csdn.net/a635661820/article/details/44803071