参考论文:
Srikant R, Agrawal R.Mining sequential patterns: Generalizations and performance improvements[M].Springer Berlin Heidelberg, 1996.
1. 参考论文描述:
Srikant R, Agrawal R. 提出的算法,在原有aporiori的基础上,引入了3个新的概念来定义频繁模式子序列:
1) 加入时间约束,使得原有的aporiori关注的连续变成了只要满足min_gap和max_gap约束的序列,都算是连续的。
2) 加入time_window_size。使得transaction有新的定义。只要在window_size内的item,都可以认为是在同一个itemset。
3) 加入分类标准。
本文的GSP算法实现步骤如下:
1) 候选集生成
a) 合并阶段: 2个subsequence s1和s2能够合并的标准是,去掉s1中的第一个item,去掉s2中的最后一个item,若此时s1和s2相同,则可以对s1和s2进行合并,即将s2中的最后一个item加入到s1中,其中最后一个item是否为合并在原来s1的最后一个itemset,还是自成一个新的itemset,取决于s2的最后一个item是否原来就是一个单独的itemset。
b) 剪枝阶段:不频繁子序列的超集也不频繁。
2) 候选集计算
a) 减少候选集检验是否满足的个数:在这,使用到了哈希树,即将序列中的第p个item映射在第p层上。然后按照候选集进行检查。
b) 检查语料库中是否有满足要求的特定子序列:此处用到了倒排记录表,而倒排记录表的ID可以是item,也可以是transaction-time。简要步骤就是,首先找到第一个itemset,记录时间,然后对于第二个itemset,在第一个itemset的时间上往下继续查找,当找到时,判断2个transaction_time的时间是否满足min_gap和max_gap,如果不满足,则在第二个itemset的基础上先往回找最近第一个itemset时间,如果此时仍不满足,则在第二个itemset的时间的基础上往后查找,如此forward和 backward的步骤。
3) 分类。
2. 算法实现细节:
大致流程:
读入数据à 建立倒排索引表(item to time)à 产生1-frequency itemsetà 产生2-frequency itemsetà …
源文件Ubuntu下编译: c++ gsp.cpp–o gsp
输入参数: ./gsp –i data.txt–t minSupport -min min_gap –max max_gap
例如: ./gsp –i data.txt–t 0.9 -min 2 -max 4
(1). 数据读入:
读入的数据类型为:
TransactionID itemNum item1 item2 item3…
TransactionID itemNum item1 item2 item3…
其中,其中itemNum为一个itemset的item个数,比如读入:
1 2 3 3
1 1 2
1 2 3 4
2 2 1 2
则此时的sequence分别为 <(3 3) (2) (3 4)> , <(1 2)>,此时根据TransactionID来判断是否在一个sequence中。
(2). 采用STL的vector和map来实现,其中主要的数据结构如下:
// An element in thedata sequence.
struct Itemset
{
std::vector<elemType> item; // store the single itemset‘s items
int timestamp; // record current itemsettimestamp
Itemset(){
timestamp = 0;
}
};
// data sequence.
struct Sequence{
int tid; // transation id
int num; // the number of itemset.
std::vector<Itemset> itemset; // store the itemsets.
Sequence() {
tid = 0;
num = 0;
}
};
(3) 建立倒排索引(inverted index)
函数:voidCreateInvertedIndex(int seqNum)
实现细节:vector<vector<map<elemType,int>>> elem2time, 用此来存储每个sequence中每个item到timestamp的映射。在计算支持度的时候将用到。
(4) 产生1-frequency itemset.
函数: std::map<elemType,int> GenerateOneSequence(int seqNum)
实现细节: 对给定数据扫描,用map来映射每个item,然后对然后累加每个item在某个sequence中出现的次数,计算是否满足min_support.
(5) 产生2-frequncy itemset.
函数:vector<Sequence>GenerateTwoSequence(vector<int> oneSeq)
实现细节: 对于1-frequency itemset, 分别由此得到<(1 2)> 、<(1) (2)>如此的itemset, 然后对得到的所有sequence进行剪枝,剪枝策略是通过计算他们在原来数据中的支持度。
(6) 产生3-more frequency itemset.
函数:vector<Sequence> JoinPhase(vector<Sequence> tmpCandidateSet)
实现细节:按照论文的说法,去掉s1中的第一个item和去掉s2中的最后一个item, 合并成新的sequence. 其中还包含判断加入s2的最后一个item是否自成一个itemset,或者合并在已有的itemset中。
函数:vector<Sequence>PrunePhase(std::vector<Sequence> duplicateSet)
实现细节:调用CountSupport函数,进行计算支持度,如果支持度大于阈值,则加入candidateSet。
函数:intCountSupport(Sequence curSequence)
实现细节:对每个sequence,根据已经建好的倒排记录表(数据集中的items映射到时间),首先找到每个itemset的时间,进行存储在vector中,然后进行dfs查询,判断是否满足min_gap和max_gap,如果满足则是一个满足的k-frequency sequence。
函数:boolDfs(vector<vector<int>>storeTime, int len, int cur, int id)
实现细节: 对已经得到的某个sequence每个itemset的对应时间,递归查找是否满足要求min_gap和max_gap,是的话使当前查询的sequence的支持度加1.
(7) 输出
函数:voidOutput(std::vector<Sequence> candidateSet, int id)
实现细节:输出满足的frequent sequence。
#include "structure.h"
using namespace std;
// data literals.
vector<Sequence> literals;
// inverted index
vector<vector<map<elemType, int>>> elem2time;
vector<Sequence> infrequencySeq;
// argv[0]='data.txt', argv[1] = min_support
// argv[2]=min_gap, argv[3] = max_gap
int main(/*int argc, char *argv[]*/)
{
freopen("100.txt", "r", stdin);
// minimum support
minSupport = 0.9;
// the number of sequence.
seqNum = 0;
// continues min_gap
min_gap = 2;
// continuous max_gap
max_gap = 4;
//Sequence seq;
ReadData(seqNum);
//count support
min_support = seqNum * minSupport;
// create inverted index
CreateInvertedIndex(seqNum);
// record start time.
int tim1 = time(0);
// Implementation of Gsp algorithm;
Gsp(seqNum);
// record the end time.
int tim2 = time(0);
OutputParameter(min_gap, max_gap, minSupport, seqNum, tim1, tim2);
return 0;
}
/*******************************************************************************
* @Reference: Mining Sequence Patterns: Generalizations
* and Performance Improvements Srikant.R, Agrawal.R,
VLDB 96;
* @description: The main strategy by GSP algorithm here is:
* 1. Candidate Generation:
* 1) Join phase
* 2) Prune phase.
* 2. Counting Candidates:
* 1) Reducing the number of candidates that need
* to be checked.
* a) Adding candidate sequences to the hash-tree
* b) Finding the candidates contained in the
* data-sequence
* 2) Checking whether a data-sequence contains a specific
* sequence
* Contains test.
* a) Forward phase
* b) Backward phase
* Here we simply using DFS to implement it.
* 3. Taxonomies.
* @CreateTime: 2014-12-7 10:00
* @LastModifiedTime: 2014-12-9 1:25
*******************************************************************************/
void Gsp(int seqNum)
{
freopen("output.txt", "w", stdout);
map<elemType, int> cand;
// Scan the database to find 1-sequence.
//-----------------------------first scan-----------------------------------
cand = GenerateOneSequence(seqNum);
//--------------------------first scan end----------------------------------
vector<elemType> oneSeq;
// Get 1-frequency sequence
for(map<elemType, int>::iterator iter1 = cand.begin();
iter1 != cand.end(); iter1 ++)
oneSeq.push_back(iter1->first);
// Get 2-frequency sequence;
vector<Sequence> candidateSet;
candidateSet = GenerateTwoSequence(oneSeq);
Output(candidateSet, 2);
for(int k = 3;; k ++)
{
candidateSet = JoinPhase(candidateSet);
if(candidateSet.size() == 0) {
cout << "No further " << k-1 << "-frequency patterns." << endl;
break;
}
else{
//candidateSet = PrunePhase(candidateSet);
Output(candidateSet, k);
}
}
}
// First scan the database to get frequency is one itemset.
map<elemType, int> GenerateOneSequence(int seqNum)
{
map<elemType, int> dup;
map<elemType, int> cand;
// count every item in the sequence.
for(int k = 0; k < literals.size(); k ++)
{
map<elemType, int> item;
for(int i = 0; i < literals[k].itemset.size(); i ++)
{
for(int j = 0; j < literals[k].itemset[i].item.size(); j ++)
{
item[literals[k].itemset[i].item[j]] ++;
}
}
map<elemType, int>::iterator iter;
for(iter = item.begin(); iter != item.end(); iter ++)
{
if(iter->second > 0)
dup[iter->first] ++;
}
}
// add up to become support.
map<elemType, int>::iterator iter1, iter2;
for (iter1 = dup.begin(); iter1 != dup.end(); iter1 ++)
if(iter1->second >= min_support)
{
cand[iter1->first] = iter1->second;
}
cout << "min_support = " << minSupport * seqNum << endl;
cout << "1-frequent candidate set" << endl;
for(iter1 = cand.begin(); iter1 != cand.end(); iter1 ++)
printf("<%d>: %d\n", iter1->first, iter1->second);
return cand;
}
// Generate two frequency sequences.
vector<Sequence> GenerateTwoSequence(vector<int> oneSeq)
{
vector<Sequence> candidateSet;
// gather all the two-frequency sequence.
for(int i = 0; i < oneSeq.size(); i ++)
for(int j = 0; j < oneSeq.size(); j ++)
{
// <aa> <ab>
Sequence duplicate;
Itemset tmpItem1, tmpItem2;
tmpItem1.item.push_back(oneSeq[i]);
duplicate.itemset.push_back(tmpItem1);
tmpItem2.item.push_back(oneSeq[j]);
duplicate.itemset.push_back(tmpItem2);
int cnt = CountSupport(duplicate);
if(cnt >= min_support)
candidateSet.push_back(duplicate);
else infrequencySeq.push_back(duplicate);
}
// Generate <(ab)>. <(ac)>
for(int i = 0; i < oneSeq.size() - 1; i ++)
for(int j = i+1; j < oneSeq.size(); j ++)
{
Sequence duplicate;
Itemset tmpItem;
tmpItem.item.push_back(oneSeq[i]);
tmpItem.item.push_back(oneSeq[j]);
duplicate.itemset.push_back(tmpItem);
int cnt = CountSupport(duplicate);
duplicate.num = cnt;
if(cnt >= min_support)
candidateSet.push_back(duplicate);
else infrequencySeq.push_back(duplicate);
}
//return PrunePhase(candidateSet);
return candidateSet;
}
// generate k >= 3-frequency sequences
vector<Sequence> JoinPhase(vector<Sequence> tmpCandidateSet)
{
vector<vector<elemType>> seq;
vector<Sequence> candidateSet;
for(int k = 0; k < tmpCandidateSet.size(); k ++)
{
vector<elemType> items;
for(int i = 0; i < tmpCandidateSet[k].itemset.size(); i ++)
for(int j = 0; j < tmpCandidateSet[k].itemset[i].item.size(); j ++)
items.push_back(tmpCandidateSet[k].itemset[i].item[j]);
seq.push_back(items);
}
for(int i = 0; i < seq.size(); i ++)
for(int j = 0; j < seq.size(); j ++)
{
vector<elemType> s1;
vector<elemType> s2;
for(int p = 1; p < seq[i].size(); p ++)
s1.push_back(seq[i][p]);
for(int q = 0; q < seq[j].size(); q ++)
s2.push_back(seq[j][q]);
bool flag = false;
for(int k = 0; k < s1.size(); k ++)
if(s1[k] != s2[k])
flag = true;
if(!flag){
Sequence tmpSeq;
for(int p = 0; p < tmpCandidateSet[i].itemset.size(); p ++)
{
Itemset tmpItems;
for(int q = 0; q < tmpCandidateSet[i].itemset[p].item.size(); q ++)
tmpItems.item.push_back(tmpCandidateSet[i].itemset[p].item[q]);
tmpSeq.itemset.push_back(tmpItems);
}
int tp = tmpCandidateSet[j].itemset.size()-1;
int tq = tmpCandidateSet[j].itemset[tp].item.size()-1;
int sp = tmpCandidateSet[i].itemset.size()-1;
int sq = tmpCandidateSet[i].itemset[sp].item.size()-1;
if(tmpCandidateSet[j].itemset[tp].item.size() > 1)
{
tmpSeq.itemset[sp].item.push_back(s2[s2.size()-1]);
}
else{
Itemset tmpItems;
tmpItems.item.push_back(s2[s2.size()-1]);
tmpSeq.itemset.push_back(tmpItems);
}
int cnt = CountSupport(tmpSeq);
tmpSeq.num = cnt;
if(cnt >= min_support)
candidateSet.push_back(tmpSeq);
else infrequencySeq.push_back(tmpSeq);
}
}
return candidateSet;
}
// Prune Phase;
// Strategy here, we will use forward phase, and backward phase.
vector<Sequence> PrunePhase(std::vector<Sequence> duplicateSet)
{
vector<Sequence> candidateSet;
for(int k = 0; k < duplicateSet.size(); k ++)
{
Sequence duplicate;
for(int i = 0; i < duplicateSet[k].itemset.size(); i ++)
{
Itemset items;
for(int j = 0; j < duplicateSet[k].itemset[i].item.size(); j ++)
items.item.push_back(duplicateSet[k].itemset[i].item[j]);
duplicate.itemset.push_back(items);
}
int cnt = CountSupport(duplicate);
if(cnt >= min_support)
{
duplicate.num = cnt;
candidateSet.push_back(duplicate);
}
}
return candidateSet;
}
// Count candidate set's support
int CountSupport(Sequence curSequence)
{
int cnt = 0;
//cout << "CountSupport" << endl;
vector<vector<int>> seq;
for(int i = 0; i < curSequence.itemset.size(); i ++)
{
vector<int> subSeq;
for(int j = 0; j < curSequence.itemset[i].item.size(); j ++)
subSeq.push_back(curSequence.itemset[i].item[j]);
seq.push_back(subSeq);
}
for(int k = 0; k < literals.size(); k ++)
{
vector<vector<int>> storeTime;
for(int i = 0; i < seq.size(); i ++)
{
int matchNum;
int t = -1;
vector<int> localTime;
for(int p = 0; p < elem2time[k].size(); p ++)
{
matchNum = 0;
for(int j = 0; j < seq[i].size(); j ++)
if(elem2time[k][p][seq[i][j]] > 0)
{
t = elem2time[k][p][seq[i][j]];
matchNum ++;
}
if(matchNum == seq[i].size())
localTime.push_back(t);
}
if(localTime.size() > 0)
storeTime.push_back(localTime);
}
if(storeTime.size() != seq.size())
continue;
for(int i = 0; i < storeTime[0].size(); i ++)
{
bool flag = false;
flag = Dfs(storeTime, storeTime.size(), storeTime[0][i], 1);
if (flag)
{
cnt ++;
break;
}
}
}
return cnt;
}
// Dfs to find a sequence contain the min_gap and max_gap conditions
bool Dfs(vector<vector<int>>storeTime, int len, int cur, int id)
{
if(id == len)
return true;
for(int i = 0; i < storeTime[id].size(); i ++){
if(storeTime[id][i] - cur >= min_gap &&
storeTime[id][i] - cur <= max_gap)
{
return Dfs(storeTime, storeTime.size(), storeTime[id][i], id + 1);
}
}
return false;
}
// Output k-th candidateSet.
void Output(std::vector<Sequence> candidateSet, int id)
{
int len = candidateSet.size();
if(len != 0)
cout << id << "-frequency candidate set" << endl;
for(int k = 0; k < len; k ++)
{
cout << "<";
for(int i = 0; i < candidateSet[k].itemset.size(); i ++)
{
cout << "(";
for(int j = 0; j < candidateSet[k].itemset[i].item.size(); j ++)
{
cout << candidateSet[k].itemset[i].item[j] << " ";
}
cout << ")";
}
cout << ">: ";
cout << candidateSet[k].num << endl;
}
cout << endl;
}
// Read data from file
void ReadData(/*Sequence *seq, */int &seqNum)
{
int id, num, tmp, t;
int cnt = 1;
map<int, int> transactionID;
while(cin >> id)
{
Sequence seq;
if(transactionID[id] == 0)
{
Itemset items;
cin >> num;
t = 0;
for(int i = 0; i < num; i ++)
{
cin >> tmp;
items.item.push_back(tmp);
}
items.timestamp = t;
t ++;
seq.itemset.push_back(items);
seq.tid = id;
literals.push_back(seq);
transactionID[id] = cnt ++;
}
else{
Itemset items;
cin >> num;
for(int i = 0; i < num; i ++)
{
cin >> tmp;
items.item.push_back(tmp);
}
items.timestamp = t;
t ++;
literals[transactionID[id]-1].itemset.push_back(items);
}
}
seqNum = literals.size();
}
inline int min(int a, int b)
{
return a > b ? b : a;
}
// Create Inverted Index
void CreateInvertedIndex(int seqNum)
{
for(int k = 0; k < seqNum; k ++)
{
vector<map<elemType, int>> invertedIndex;
for(int i = 0; i < literals[k].itemset.size(); i ++)
{
map<elemType, int> tmp;
for(int j = 0; j < literals[k].itemset[i].item.size(); j ++)
tmp[literals[k].itemset[i].item[j]] = i+1;
invertedIndex.push_back(tmp);
}
elem2time.push_back(invertedIndex);
}
}
// output parameters
void OutputParameter(int min_gap, int max_gap, double min_support, int seqNum,
int tim1, int tim2)
{
cout << "---------------------------------" << endl;
cout << "\nmin_gap = " << min_gap << endl;
cout << "max_gap = " << max_gap << endl;
cout << "min_support = " << min_support << endl;
cout << "the number of sequence = " << seqNum << endl;
// output time
cout << "\nbegin time: " << tim1 << '\n';
cout << "end time : " << tim2 << '\n';
cout << "The execution time is:\n" << tim2-tim1;
cout << "\n\nEnd the program" << endl;
cout << "---------------------------------" << endl;
}
GSP Algorithm: Sequence Mining.
原文:http://blog.csdn.net/zone_programming/article/details/42032309