放假了把这个改一下,发现确实用单字节压缩的压缩率要高一些,暂时没去管为什么,不过放假静下心来写的话确实效率高很多。
新版详见:http://blog.csdn.net/tookkke/article/details/50575103
今天脑洞大开突然想写一下,明明都要考试了,唉,怎么就管不住这手啊
总之呢,就是根据每种编码的出现频率把等长的编码换成变长的,据说理论上压缩比率是比较高的,可是为什么经检验我这个大部分时候压缩出来的比源文件还大呢?
哈夫曼编码的时候要先做一棵字典树,查找的时候就按照当前一位是0还是1决定往哪个子节点走,走到叶子节点就找到了原编码。生成这棵树用的是一种贪心法:每次从优先队列里选出两个出现频率最小的节点,连在一个新的节点上,再把该新节点加入优先队列,它的频率是这两个节点频率之和。
我这里把整棵生成的树都保存在了新文件中,同时保存的还有原文件内容转换成变长编码之后的总长度(以 bit 为单位,不能保证一定是8的倍数)。原编码是按两个字节一组来看的,共65536种状态;因为不能保证原文件的字节数一定是偶数,所以新文件又用了两个字节来保存多出来的那一个字节,直接添加到新文件末尾。
这是压缩程序,得到的文件名是原名+.kcps(随便写写也没那么多注释了)
/*
 * Huffman compressor operating on 16-bit symbols.
 *
 * The input file is read as consecutive two-byte words (native byte order);
 * each of the 65536 possible words is a symbol.  A Huffman tree is built from
 * the symbol frequencies and the input is re-encoded with the resulting
 * variable-length codes.  The result is written to "<input>.kcps".
 *
 * Output layout (every field is a raw dump of the in-memory object, so the
 * file is only portable between machines with the same int size/endianness):
 *   tree    - nodes 1 .. 2*MAX_WORD-1, one the_tree record each
 *   bit_num - ULL, number of payload bits that follow
 *   cps     - payload bits, packed into bytes starting at the least
 *             significant bit; the last byte is zero-padded
 *   exBYTE  - WORD, the trailing odd byte of the input, or MAX_BYTE if the
 *             input had an even number of bytes
 *
 * Note: the complete tree is always stored, i.e. (2*MAX_WORD-1) the_tree
 * records (about 1 MiB with 4-byte int), so small inputs necessarily grow.
 */
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <queue>
#include <vector>

#define MAX_WORD (65536)
#define MAX_BYTE (256)

using namespace std;

typedef unsigned char BYTE;
typedef unsigned short WORD;
typedef unsigned long DWORD;
typedef unsigned long long ULL;

/* cnt[MAX_WORD + w] is the number of occurrences of word w; cnt[i] for
   1 <= i < MAX_WORD is the accumulated weight of internal tree node i. */
ULL cnt[MAX_WORD*2];
/* Trailing odd byte of the input; stays MAX_BYTE when there is none. */
WORD exBYTE=MAX_BYTE;
/* Total number of payload bits. */
ULL bit_num;
FILE *infp,*outfp;
/* kkke[w][0] = code bits of word w (first bit is the most significant of the
   kkke[w][1] low bits), kkke[w][1] = code length in bits. */
ULL kkke[MAX_WORD][2];
int tree_size=MAX_WORD*2;
/* Nodes 1 .. MAX_WORD-1 are internal (root is 1); node MAX_WORD + w is the
   leaf for word w.  son[0] is followed on a 0 bit, son[1] on a 1 bit. */
struct the_tree{
    int son[2];
}tree[MAX_WORD*2];

/* Orders node indices so that the priority queue yields the lightest first. */
struct cmp{
    bool operator()(int a,int b)
    {
        return cnt[a]>cnt[b];
    }
};

/* Writes `size` bytes to the output file and aborts on a short write. */
static void write_or_die(const void *data,size_t size)
{
    if(fwrite(data,size,1,outfp)!=1)
    {
        printf("fail to write new file\n");
        exit(1);
    }
}

/* Asks for the input file name and opens the input and "<name>.kcps". */
void open_file()
{
    /* The output buffer has room for the appended ".kcps" suffix. */
    char filename[256],outfilename[sizeof(filename)+8];
    printf("please input the original file: ");
    /* Bounded read: an unbounded %s could overflow the buffer. */
    if(scanf("%255s",filename)!=1)
    {
        printf("fail to open original file\n");
        exit(1);
    }
    strcpy(outfilename,filename);
    strcat(outfilename,".kcps");
    if((infp=fopen(filename,"rb"))==NULL)
    {
        printf("fail to open original file\n");
        exit(1);
    }
    if((outfp=fopen(outfilename,"wb"))==NULL)
    {
        printf("fail to create new file\n");
        exit(1);
    }
}

/* Closes both files; a failed close is reported through the exit status. */
void close_file()
{
    if(fclose(infp))
    {
        printf("fail to close original file\n");
        exit(1);
    }
    if(fclose(outfp))
    {
        /* Buffered data may not have reached the disk: do not report success. */
        printf("fail to close new file\n");
        exit(1);
    }
}

/* First pass over the input: counts every two-byte word and remembers a
   trailing single byte in exBYTE. */
void read_data()
{
    BYTE buf[sizeof(WORD)];
    size_t len;
    while((len=fread(buf,sizeof(BYTE),sizeof(buf),infp))!=0)
    {
        if(len<sizeof(buf))
        {
            /* Take the byte that was actually read instead of the low byte
               of a half-filled WORD, which is only right on little-endian. */
            exBYTE=buf[0];
            break;
        }
        WORD a;
        memcpy(&a,buf,sizeof(a));
        cnt[a+MAX_WORD]++;
    }
    if(ferror(infp))
    {
        printf("fail to read original file\n");
        exit(1);
    }
}

/* Greedy Huffman construction: repeatedly joins the two lightest nodes under
   a new internal node.  Internal nodes are allocated from MAX_WORD-1 down to
   1, so the last node created (the root) is node 1. */
void build_tree()
{
    priority_queue<int,vector<int>,cmp>q;
    for(int i=MAX_WORD+MAX_WORD-1;i>=MAX_WORD;i--)q.push(i);
    for(int i=MAX_WORD-1;i;i--)
    {
        tree[i].son[0]=q.top();q.pop();
        tree[i].son[1]=q.top();q.pop();
        cnt[i]=cnt[tree[i].son[0]]+cnt[tree[i].son[1]];
        q.push(i);
    }
}

/* Walks the tree from node k and records, for every leaf, the code `a`
   accumulated on the path and its length `depth` in kkke. */
void dfs(int k,ULL a,int depth)
{
    if(k>=MAX_WORD)
    {
        k-=MAX_WORD;
        kkke[k][0]=a;
        kkke[k][1]=depth;
    }
    else
    {
        dfs(tree[k].son[0],a<<1,depth+1);
        dfs(tree[k].son[1],(a<<1)|1,depth+1);
    }
}

/**********************/
/*****    tree    *****/
/*****   bit_num  *****/
/*****     cps    *****/
/*****   exBYTE   *****/
/**********************/
/* Writes the compressed file in the layout sketched above; this is the
   second pass over the input. */
void output()
{
    for(int i=1;i<tree_size;i++)write_or_die(&tree[i],sizeof(tree[i]));

    bit_num=0;
    for(int i=0;i<MAX_WORD;i++)
    {
        /* A used code must fit in the 64-bit kkke[i][0]; otherwise bits were
           lost in dfs and the shift below would be undefined. */
        if(cnt[i+MAX_WORD]&&kkke[i][1]>64)
        {
            printf("fail to encode: code longer than 64 bits\n");
            exit(1);
        }
        bit_num+=cnt[i+MAX_WORD]*kkke[i][1];
    }
    write_or_die(&bit_num,sizeof(bit_num));

    if(fseek(infp,0,SEEK_SET))
    {
        printf("fail to read original file\n");
        exit(1);
    }
    WORD a;
    BYTE b=(BYTE)0;
    ULL written=0;/* number of payload bits emitted so far */
    while(fread(&a,sizeof(BYTE),sizeof(WORD)/sizeof(BYTE),infp)==sizeof(WORD)/sizeof(BYTE))
    {
        /* Emit the code from its first (most significant) bit to its last. */
        for(long long i=(long long)kkke[a][1]-1LL;i>=0LL;i--)
        {
            if(kkke[a][0]&(1ULL<<i))b|=(BYTE)(1u<<(written%8));
            written++;
            if(written%8==0)
            {
                write_or_die(&b,sizeof(b));
                b=0;
            }
        }
    }
    /* Flush the partially filled last byte, if any. */
    if(written%8)write_or_die(&b,sizeof(b));
    write_or_die(&exBYTE,sizeof(exBYTE));
}

int main()
{
    open_file();
    read_data();
    build_tree();
    dfs(1,0ULL,0);
    output();
    close_file();
    return 0;
}
/*
 * Decompressor for ".kcps" files.
 *
 * Expected input layout (every field is a raw dump of the in-memory object):
 *   tree    - nodes 1 .. 2*MAX_WORD-1, one the_tree record each
 *   bit_num - ULL, number of payload bits that follow
 *   cps     - payload bits, packed into bytes starting at the least
 *             significant bit
 *   exBYTE  - WORD, a trailing single byte of the original file, or a value
 *             >= MAX_BYTE if there is none
 *
 * The restored file is written to "(new)" + the input name with a trailing
 * ".kcps" removed.
 */
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <queue>

#define MAX_WORD (65536)
#define MAX_BYTE (256)

using namespace std;

typedef unsigned char BYTE;
typedef unsigned short WORD;
typedef unsigned long DWORD;
typedef unsigned long long ULL;

/* Trailing odd byte of the original file; >= MAX_BYTE means "none". */
WORD exBYTE=MAX_BYTE;
/* Number of payload bits stored in the compressed file. */
ULL bit_num;
FILE *infp,*outfp;
int tree_size=MAX_WORD*2;
/* Nodes 1 .. MAX_WORD-1 are internal (root is 1); node MAX_WORD + w is the
   leaf for the two-byte word w.  son[0] is followed on a 0 bit, son[1] on a
   1 bit. */
struct the_tree{
    int son[2];
}tree[MAX_WORD*2];

/* Aborts because the compressed file is truncated or malformed. */
static void bad_input()
{
    printf("fail to read compressed file\n");
    exit(1);
}

/* Writes `size` bytes to the output file and aborts on a short write. */
static void write_or_die(const void *data,size_t size)
{
    if(fwrite(data,size,1,outfp)!=1)
    {
        printf("fail to write new file\n");
        exit(1);
    }
}

/* Asks for the compressed file name, opens it and creates the output file. */
void open_file()
{
    /* The output buffer has room for the "(new)" prefix. */
    char filename[256],outfilename[sizeof(filename)+8];
    printf("please input the compressed file: ");
    /* Bounded read: an unbounded %s could overflow the buffer. */
    if(scanf("%255s",filename)!=1)
    {
        printf("fail to open compressed file\n");
        exit(1);
    }
    if((infp=fopen(filename,"rb"))==NULL)
    {
        printf("fail to open compressed file\n");
        exit(1);
    }
    /* Strip the extension only when it really is the end of the name, so
       that e.g. "a.kcps.kcps" restores to "a.kcps" and not to "a". */
    static const char suffix[]=".kcps";
    const size_t name_len=strlen(filename);
    const size_t suffix_len=sizeof(suffix)-1;
    if(name_len>suffix_len&&strcmp(filename+name_len-suffix_len,suffix)==0)
    {
        filename[name_len-suffix_len]='\0';
    }
    strcpy(outfilename,"(new)");
    strcat(outfilename,filename);
    if((outfp=fopen(outfilename,"wb"))==NULL)
    {
        printf("fail to create new file\n");
        exit(1);
    }
}

/* Closes both files; a failed close is reported through the exit status. */
void close_file()
{
    if(fclose(infp))
    {
        printf("fail to close compressed file\n");
        exit(1);
    }
    if(fclose(outfp))
    {
        /* Buffered data may not have reached the disk: do not report success. */
        printf("fail to close new file\n");
        exit(1);
    }
}

/**********************/
/*****    tree    *****/
/*****   bit_num  *****/
/*****     cps    *****/
/*****   exBYTE   *****/
/**********************/
/* Reads the tree and the payload bit count from the head of the file. */
void read_data()
{
    for(int i=1;i<tree_size;i++)
    {
        if(fread(&tree[i],sizeof(tree[i]),1,infp)!=1)bad_input();
    }
    /* The child links of internal nodes are used as array indices while
       decoding, so reject anything that would leave the tree array. */
    for(int i=1;i<MAX_WORD;i++)
    {
        for(int s=0;s<2;s++)
        {
            if(tree[i].son[s]<1||tree[i].son[s]>=tree_size)bad_input();
        }
    }
    if(fread(&bit_num,sizeof(bit_num),1,infp)!=1)bad_input();
}

/* Decodes the payload bit by bit, writing one two-byte word per leaf
   reached, then appends the trailing odd byte if the original had one. */
void output()
{
    BYTE a=0;
    WORD b;
    int now_node=1;
    /* The counter has the same type as bit_num so large payloads cannot
       overflow it. */
    for(ULL i=0;i<bit_num;i++)
    {
        /* Fetch the next payload byte every eight bits. */
        if(i%8==0&&fread(&a,sizeof(a),1,infp)!=1)bad_input();
        now_node=tree[now_node].son[(a>>(i%8))&1];
        if(now_node>=MAX_WORD)
        {
            b=(WORD)(now_node-MAX_WORD);
            write_or_die(&b,sizeof(b));
            now_node=1;
        }
    }
    if(fread(&exBYTE,sizeof(exBYTE),1,infp)!=1)bad_input();
    if(exBYTE<MAX_BYTE)
    {
        /* exBYTE is stored as a WORD but stands for exactly one byte of the
           original file; writing sizeof(exBYTE) bytes would add a spurious
           extra byte to every odd-length file. */
        BYTE last=(BYTE)exBYTE;
        write_or_die(&last,sizeof(last));
    }
}

int main()
{
    open_file();
    read_data();
    output();
    close_file();
    return 0;
}
高兴的是其实有一次压缩把10000K压成了9000K,还是挺开心的,虽然只有一次...
原文:http://blog.csdn.net/tookkke/article/details/50529838