最近做数据挖掘相关的工作,题目是时间序列聚类研究,目前对于这方面的研究都还只是在起步阶段,被广泛使用的还是基于K-MEDOIDS的聚类,放弃K-MEANS的主要原因还是时间序列之间序列的计算难度,对于这方面我们也已经有了一定的进展,不过也还是有很多的问题。
把基于DTW与K-MEDOIDS的时间序列聚类的算法贴出来,希望对大家有些帮助吧。
这份代码是我在以前的代码的基础上直接改的,所以C和C++有些混用。
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <iostream>
- using namespace std;
- #define NA 60 /* 数据维数 */
- #define K 6 /* 聚类数 */
- #define Psize 36 /* 种群大小 */
- #define T 30 /* 最大迭代数 */
- #define ED 0.0000001 /* 结束条件 */
- #define Min 1000000 /*最小值*/
- #define MinCmp(a,b) (a<b?a:b)
- #define INF 300000000
- typedef struct {
- double p[NA];
- double distance[K];
- }Point;
- typedef struct {
- Point clu_cent[K];
- int cluster[K][Psize];
- int cluster_num[K];
- double fitness;
- double old_fitness;
- double Je;
- }Pop;
-
- int Is_equal(int a[], int n, int b);
- double Euclid1(double x, double y);
- double dtw(int x, int y);
- void input_data();
- void Init_center();
- void calculate_distance();
- void Make_new_cluster();
- void Make_new_center();
- void output_info(int flag);
- Point all_data[Psize];
- Pop pop;
- double Dtwdistance(Point x, Point y)
- {
- double distance[NA+1][NA+1];
- double output[NA+1][NA+1];
- int i,j;
- memset(distance,0,sizeof(distance));
- memset(output,0,sizeof(output));
- for (i=1;i<=NA;i++)
- for (j=1;j<=NA;j++)
- distance[i][j]=Euclid1(x.p[i-1],y.p[j-1]);
- output[1][1]=distance[1][1];
- for (i=0;i<=NA;i++)
- {
- output[i][0]=INF;
- output[0][i]=INF;
- }
- for (i=2;i<=NA;i++)
- output[i][1]=output[i-1][1]+distance[i][1];
- for (i=2;i<=NA;i++)
- output[1][i]=output[1][i-1]+distance[1][i];
- for(i=2;i<=NA;i++)
- for(j=2;j<=NA;j++)
- output[i][j]=Mintt(Mintt(output[i-1][j-1],output[i][j-1]),output[i-1][j])+distance[i][j];
-
- return output[NA][NA];
- }
-
- double dtw(int a, int b)
- {
- if (a==b)
- return 0;
- return Dtwdistance(all_data[a],all_data[b]);
- }
-
- void input_data()
- {
- int i,j,temp;
- freopen("data.txt","r",stdin);
- for(i = 0; i < Psize; i++)
- {
- cin>>temp;
- for (j=0;j<NA;j++)
- {
- cin>>all_data[i].p[j];
- }
- }
- }
-
- void Init_center()
- {
- int i, j;
- int num = 0;
- int rand_num;
- int rand_num_tmp[K];
-
- while(num < K){
- rand_num = rand() % Psize;
- if(!Is_equal(rand_num_tmp, num, rand_num))
- rand_num_tmp[num++] = rand_num;
- }
- freopen("output.txt","w",stdout);
- cout<<"初始化重心为:"<<endl;
- for(i = 0; i < K; i++){
- cout<<rand_num_tmp[i]<<endl;
- for(j = 0; j < NA; j++)
- {
- cout<<all_data[rand_num_tmp[i]].p[j]<<‘/t‘;
- pop.clu_cent[i].p[j] = all_data[rand_num_tmp[i]].p[j];
- }
- cout<<endl;
- }
- }
- int Is_equal(int a[], int n, int b)
- {
- int i;
- for(i = 0; i < n; i++)
- if(a[i] == b) return 1;
- return 0;
- }
-
- void calculate_distance()
- {
- int i, j;
- for(i = 0; i < Psize; i++)
- for(j = 0; j < K; j++){
- all_data[i].distance[j] = dtw(i, j);
- }
- }
-
- double Euclid1(double x, double y)
- {
- return (y-x)*(y-x);
- }
- void Make_new_cluster()
- {
- int i, j;
- double min;
-
- for(i = 0; i < K; i++)
- pop.cluster_num[i] = 0;
- for(i = 0; i < Psize; i++){
- int index = 0;
- min = all_data[i].distance[0];
- for(j = 1; j < K; j++){
- if(all_data[i].distance[j] < min){
- min = all_data[i].distance[j];
- index = j;
- }
- }
-
- pop.cluster[index][pop.cluster_num[index]++] = i;
- }
-
- pop.Je = 0;
- for(i = 0; i < K; i++)
- for(j = 0; j < pop.cluster_num[i]; j++){
-
- pop.Je +=pow(all_data[pop.cluster[i][j]].distance[i],2);
- }
- pop.old_fitness = pop.fitness;
- pop.fitness = pop.Je;
- }
- void Make_new_center()
- {
- int i, j, t,n;
- double tmp_sum;
- int index;
- double min=Min;
- for(i = 0; i < K; i++)
- {
- tmp_sum = 0;
- for(j= 0;j< pop.cluster_num[i];j++)
- for (t=0;t<pop.cluster_num[i];t++)
- {
- tmp_sum =dtw(pop.cluster[i][j],pop.cluster[i][t]);
- if(tmp_sum<min)
- min=tmp_sum;
- index=j;
- }
- for (n=0;n<NA;n++)
- pop.clu_cent[i].p[n]=all_data[pop.cluster[i][index]].p[n];
-
- }
- }
-
- void output_info(int flag)
- {
- int i, j, n;
- for(i = 0; i < K; i++){
- if(flag == 0){
- cout<<"初始化质心:"<<i+1<<‘/t‘<<pop.cluster_num[i]<<endl;
- for(n = 0; n < NA; n++)
- cout<<pop.clu_cent[i].p[n]<<‘/t‘;
- cout<<endl;
- }
- else if(flag == 1){
- cout<<"最终质心:"<<i+1<<endl;
- for(n = 0; n < NA; n++)
- cout<<pop.clu_cent[i].p[n]<<‘/t‘;
- cout<<endl;
- }
- cout<<"簇类:" <<i + 1<<endl;
- for(j = 0; j < pop.cluster_num[i]; j++){
- cout<<pop.cluster[i][j]<<‘/t‘;
- for(n = 0; n < NA; n++)
- cout<<all_data[pop.cluster[i][j]].p[n]<<‘/t‘;
- cout<<endl;
- }
- }
- }
- int main()
- {
- int i,j;
- double differ = 1;
- int flag = 0;
- freopen("data.txt","r",stdin);
- freopen("output.txt","w",stdout);
- input_data();
- Init_center();
- for(i = 0; (i < T) && (differ > ED); i++){
- calculate_distance();
- Make_new_cluster();
-
- if(flag == 0){
- output_info(flag);
- flag = 1;
- }
- Make_new_center();
- differ = pop.old_fitness - pop.fitness;
- differ = fabs(differ);
-
-
-
- }
- printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++/n");
- output_info(flag);
- return 0;
- }
通过K-MEDOIDS算法对时间序列进行聚类的实现
原文:http://www.cnblogs.com/lingtianyulong/p/4219778.html