今天的项目是用python开发的,需要做关键词检查、过滤和分类。以前是用c语言做这种事情,用的是字典树(trie),非常高效,内存占用小,检查也快。
到了python这里,第一个想法是用pip找一个基于c语言实现的python字典树(trie)模块。不幸的是,没有找到合适的。假如我会用c编写python模块的话,就自己写一个了,可惜我还不具备这个能力。
只能用python写了,性能差一点就差一点吧,内存多占一点也无所谓了。
用搜索引擎看了CSDN上网友用python实现的DFA,再参照自己曾经用c语言写过的字典树,觉得网上的实现有些不大对,就自己写了一个。可以想象,如果用C语言实现会很高效,并且占用的空间也特别小。
以下是python代码:
class cNode(object):
    """One node of the keyword trie.

    ``children`` stays ``None`` until the first child is attached, so leaf
    nodes do not pay for an empty dict. Once populated it maps a symbol
    (one element of a word) to a ``(child_node, is_word_end)`` tuple, where
    ``is_word_end`` is True when a stored keyword ends on that edge.
    """

    def __init__(self):
        self.children = None


class cDfa(object):
    """Keyword matcher backed by a trie.

    Keywords and messages are compared element by element, so both must be
    of the same string type (all text or all byte strings). The original
    notes say the expected encoding of words and messages is UTF-8.
    """

    def __init__(self, lWords):
        """Build the trie from the iterable of keywords ``lWords``."""
        self.root = cNode()
        for sWord in lWords:
            self.addWord(sWord)

    def addWord(self, word):
        """Insert ``word`` into the trie. An empty word is ignored."""
        node = self.root
        iEnd = len(word) - 1
        for i, symbol in enumerate(word):
            if node.children is None:
                node.children = {}
            child, bWord = node.children.get(symbol, (None, False))
            if child is None:
                child = cNode()
            # Never clear an existing end-of-word flag: a shorter keyword
            # may already end on this edge.
            node.children[symbol] = (child, bWord or i == iEnd)
            node = child

    def _matchEnd(self, sMsg, iStart):
        """Return the end index (exclusive) of the shortest keyword that
        starts at ``sMsg[iStart]``, or -1 when no keyword starts there."""
        node = self.root
        iLen = len(sMsg)
        j = iStart
        while j < iLen and node.children is not None:
            entry = node.children.get(sMsg[j])
            if entry is None:
                break
            node, bWord = entry
            j += 1
            if bWord:
                return j
        return -1

    def isContain(self, sMsg):
        """Return True if any keyword occurs anywhere in ``sMsg``."""
        return any(self._matchEnd(sMsg, i) != -1 for i in range(len(sMsg)))

    def filter(self, sMsg):
        """Return a copy of ``sMsg`` with every keyword replaced by ``*``.

        Scanning is left to right and the shortest keyword at each position
        wins: with keywords ``ab`` and ``abc`` the message ``abcd`` becomes
        ``**cd``. The result has the same string type as ``sMsg``.
        """
        # Build the mask and the joiner from the message's own type. Mixing
        # a unicode mask into byte-string pieces forces an implicit ASCII
        # decode on Python 2 and fails on non-ASCII UTF-8 input.
        sMask = u'*' if isinstance(sMsg, type(u'')) else b'*'
        lNew = []
        iLen = len(sMsg)
        i = 0
        while i < iLen:
            iStop = self._matchEnd(sMsg, i)
            if iStop == -1:
                # Slice instead of index so byte strings yield a byte string.
                lNew.append(sMsg[i:i + 1])
                i += 1
            else:
                lNew.append(sMask * (iStop - i))
                i = iStop
        return sMsg[:0].join(lNew)
以下是c语言代码trie_tree.h:
#ifndef _TRIE_TREE_H_INCLUDED_
#define _TRIE_TREE_H_INCLUDED_

/* Number of child slots per node: one for every possible byte value. */
#define WORD_NUM 256

/* One node of a byte-indexed trie. */
struct trie_node {
    struct trie_node *node[WORD_NUM]; /* children, indexed by the next byte */
    int value;                        /* value given to create_trie_node() */
    int exist;                        /* 1 when an inserted word ends here */
};

/* Allocate a zeroed node carrying `value`. */
struct trie_node *create_trie_node(int value);

/* Insert the NUL-terminated byte string `word` below `root`. */
void trie_tree_insert_word(struct trie_node *root, unsigned char *word);

/* Returns 1 if present, 0 if not. */
int tire_word_is_exist(struct trie_node *root, unsigned char *word);

/*
 * Free `root` and every node below it.
 * The name matches the definition in the implementation file; the earlier
 * prototype was misspelled, so the real function was never declared.
 */
void destroy_trie_tree(struct trie_node *root);

/* Replace *root with a new trie loaded from the word list in `filename`. */
void update_trie_tree(struct trie_node **root, const char *filename);

#endif
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <trie_tree.h> struct trie_node *create_trie_node(int value) { struct trie_node * node = calloc(1, sizeof(struct trie_node)); node->value = value; return node; } int tire_word_is_exist(struct trie_node *root, unsigned char *word) { struct trie_node *n = NULL; unsigned char *p = NULL; if (root == NULL) { return 0; } while (*word != 0) { p = word++; n = root; while (*p != 0) { n = n->node[*p]; if (n == NULL) { break; } else if (n->exist == 1) { return 1; } p++; } } return 0; } void trie_tree_insert_word(struct trie_node *root, unsigned char *word) { struct trie_node *n; while (*word != 0) { n = root->node[*word]; if (n == NULL) { n = create_trie_node(*word); root->node[*word] = n; } root = n; word++; } root->exist = 1; } void destroy_trie_tree(struct trie_node *root) { int i; if (root == NULL) { return; } for (i = 0; i < WORD_NUM; i++) { destroy_trie_tree(root->node[i]); } free(root); } void update_trie_tree(struct trie_node **root, const char *filename) { char word[1024]; FILE *fp; char *p; if (*root != NULL) { destroy_trie_tree(*root); } *root = calloc(sizeof(**root),1); fp = fopen(filename, "r"); if (fp == NULL) { printf("file can't open %s\n", filename); return; } while (fgets(word, sizeof(word), fp)) { p = word; while (*p != 0) { if (*p == '\r' || *p == '\n' || *p == ' ') { *p = 0; break; } p++; } trie_tree_insert_word(*root, (unsigned char *)word); } }
版权声明:本文为博主原创文章,未经同意不得转载。
原文:http://www.cnblogs.com/lcchuguo/p/4881284.html