每日一题：单词统计

时间：2015-03-22 13:42:30 阅读：331 评论：0 收藏：0 [点我收藏+]

一篇文章（或一本书）由很多单词构成，有的时候我们想统计一下文章中单词出现的次数，怎样快速地做呢？《编程珠玑》书中提到的一种方式是使用散列表，本文实现了该方法。
为了简单，单词定义为仅由英文字母构成的，暂时忽略连字符号，所有单词由空格分开，所以先对一篇文章进行预处理：

//返回处理后单词总数
int pre_process(const char* filename)
{
    ifstream input;
    input.open(filename);
    ofstream o;
    o.open("C:/Users/liaojian/Documents/result_no_digital.txt");
    string word;
    int n = 0;
    while(input>>word)
    {
        string s;
        const char* p = word.c_str();

        while(*p)
        {
            char c = *p;
            if(c <= ‘z‘ && c >= ‘a‘ || c <= ‘Z‘ && c >= ‘A‘)
                s += c;
            ++p;
        }
        if(s != "")
        {
            o<<s<<‘ ‘;
            ++n;
        }
    }
    input.close();
    o.close();
    return n;
}

统计过程中需要同时记录单词及其出现次数，并且使用散列表对于不同的单词可能会计算出同样的hash值，为了避免冲突，使用链地址法，所以为每个单词建立如下结构体：

struct word_node
{
    char* str;
    int count;
    word_node* next;
};

散列表每个元素也是一个结构体，记录了该元素下保存了几个单词：

struct list_head
{
    int count;
    word_node* next;
};

使用网络上下载的《圣经》作为输入，该书有70多万个单词（预处理函数统计值），《编程珠玑》书中所使用的《圣经》版本有近80万个单词，去除重复后，约有30万个单词，所以本次实现也可以使用《编程珠玑》里使用的29989作为散列表的长度，hash函数乘数因子为31：

const int NHASH = 29989;
const int MULT = 31;

下面是hash函数：

unsigned int my_hash(const char* str)
{
    const char* p = str;
    unsigned int h = 0;
    while(*p)
    {
        h = MULT*h + *p;
        ++p;
    }
    return h % NHASH;
}

上面的unsigned关键字很重要，可以避免hash值变成负数（主要由溢出引起的）。
定义一个find函数来查找当前处理的单词是否已经存在：

word_node* find(list_head* hash_list,const char* str)
{
    word_node* p = hash_list->next;
    while(p)
    {
        if(strcmp(str,p->str) == 0) return p;
        p = p->next;
    }
    return NULL;
}

如果没有找到，那么就要建立一个新节点，然后将其插入的相应的hash元素后的链表中，每个新节点都插入到表头后第一元素：

void insert(list_head* hash_list,word_node* node)
{
    node->next = hash_list->next;
    hash_list->next = node;
    ++hash_list->count;
}

hash表使用结束后，还要释放内存空间：

void free_list(list_head *word_list,int list_count)
{
    for (int i = 0; i < list_count; ++i)
    {
        word_node* p = word_list[i].next;
        while(p)
        {
            word_node* q = p->next;
            delete p;
            p = q;
        }
    }
    delete []word_list;
    word_list = NULL;
}

接下来就可以进行统计了，首先建立一个hash表，然后读入每一个单词（从预处理后的文件中），对每个单词计算其hash值，找到其应该放置hash表位置，然后检查该位置是否已经包含了该单词，如果包含了，在对应的节点上将count加1，否则为其建立一个word_node新节点，然后将该节点插入到hash表相应元素后的链表中，为了后面排序方便，该函数还记录了出现次数最高的单词的出现次数，其值保存在most_wort_count中：

list_head* count(const char* filename,int &most_word_count)
{
    ifstream input;
    input.open(filename);
    string word;
    list_head* hash_list = new list_head[NHASH];
    memset(hash_list,0,NHASH*sizeof(list_head));
    most_word_count = 1;
    while(input>>word)
    {
        const char* wordptr = word.c_str();
        unsigned int h = my_hash(wordptr);
        word_node* node = find(&hash_list[h],wordptr);
        if(node)
        {
            ++node->count;
            if(node->count > most_word_count)
                most_word_count = node->count;
        }
        else
        {
            node = new word_node;
            node->count = 1;
            node->str = new char[strlen(wordptr) + 1];
            strcpy(node->str,wordptr);
            insert(&hash_list[h],node);
        }
    }
    return hash_list;
}

统计完成后，可以对统计结果进行排序，由于单词出现的最高次数不是很大（对于《圣经》而言是6万左右），出现同样次数的单词也不少，所以使用桶排序非常合适，该函数word_list表示count函数统计得到的hash表，list_count表示hash表的长度，most_word_count表示单词出现的最高次数：

list_head* sort(list_head* word_list,int list_count,int most_word_count)
{
    list_head* sorted_word_list = new list_head[most_word_count];
    memset(sorted_word_list,0,most_word_count*sizeof(list_head));
    for (int i = 0; i < list_count; ++i)
    {
        word_node* p = word_list[i].next;
        while(p)
        {
            word_node* node = new word_node;
            copy(node,p);
            insert(&sorted_word_list[most_word_count - p->count],node);
            p = p->next;
        }
    }
    return sorted_word_list;
}

该函数中的copy函数主要把待排序的节点复制到一个新节点中，然后插入到排序的数组中（当然也可以直接将该节点拿出来，然后插入到排序的数组中）：

void copy(word_node* dst_node,const word_node* src_node)
{
    dst_node->count = src_node->count;
    dst_node->str = new char[strlen(src_node->str) + 1];
    strcpy(dst_node->str,src_node->str);
}

测试代码如下（预处理的代码不在贴出）：

int _tmain(int argc, _TCHAR* argv[])
{
    char* src_file_name = "The Bible.txt";
    char* dst_file_name = "count_result_no_digital_sorted.txt";
    ofstream o;
    o.open(dst_file_name);
    int most_word_count;
    list_head* hash_list = count(src_file_name,most_word_count);
    list_head* sorted_word_list = sort(hash_list,NHASH,most_word_count);
    for (int i = 0; i < most_word_count; ++i)
    {
        word_node* p = sorted_word_list[i].next;
        while(p)
        {
            o<<p->str<<‘ ‘<<p->count<<endl;
            p = p->next;
        }
    }
    free_list(hash_list,NHASH);
    free_list(sorted_word_list,most_word_count);
    return 0;
}

程序运行结果如下：
技术分享
未排序未排序之前的结果如下：

程序运行的结果与时间（4s左右，连同排序时间）均与《编程珠玑》里描述的基本一致。

每日一题：单词统计

原文：http://blog.csdn.net/liao_jian/article/details/44536557

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)