数据挖掘: Apriori算法
数据挖掘: Apriori算法1 概述1.1 Apriori算法是由Rakesh Agrawal和Ramakrishnan Srikan发明的一种数据挖掘算法。最初是解决找到transaction数据库中不同item关联规则的问题。2 算法2.1 基本概念I = {i1, i2, ..., im} 是由m个item组成的集合;D是多个transact
·
数据挖掘: Apriori算法
1 概述
1.1 Apriori算法是由Rakesh Agrawal和Ramakrishnan Srikan
发明的一种数据挖掘算法。最初是解决找到transaction数据库中
不同item关联规则的问题。
2 算法
2.1 基本概念
I = {i1, i2, ..., im} 是由m个item组成的集合;
D是多个transaction组成的集合;D中的每个transaction T都是item组成的集合,
每一个T都有一个唯一的标示:TID;
关联规则:X, Y是T的子集,且X和Y的交集为空, 则X->Y是一个关联规则;
支持度:关联规则X->Y存在一个支持度s, 如果D中有s%的transaction含有
X和Y;
2.2 算法流程
假设每一个transaction有多个item组成,其中的item都是排序的;
每个transaction包括两个部分:TID和item ;
k-itemset是个集合,它的元素是由k个item组成itemset,其中的item是排序的,itemset也是排序的;
L[k]用于存放较大的k-itemset, 每个元素包括两个部分:itemset和支持度;
C[k]用于存放k-itemset的候选集合,每个元素包括itemset和TID;
输入:transaction集合
输出:L[k]
1) 初始化候选集合C[k]
2) 分别计算每个itemset的数量
3) 找到数量最小的itemset
4) 在C[k]中删除数量最小的itemset,把剩下的itemset加入到k-itemset
5) 将k-itemset存放到L[k]中
6) 重新生成候选集合C[k],若C[k]的元素数量大于1,则执行1),否则退出
3 算法的简单实现
3.1 后面的代码只是演示算法的C++原型程序,设计和编写上有很多不适当的地方。
没有经过仔细的测试,但基本是正确的。main函数中的例子来自于Rakesh Agrawal
和Ramakrishnan Srikan的论文。
编译:
g++ -g -W -Wall -Wextra -o mytest apriori.cpp main.cpp
执行:
./mytest
3.2 代码
apriori.h:
=========================================================
// 2012年 08月 01日 星期三 13:46:13 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#ifndef APRIORI_H
#define APRIORI_H
#include <map>
#include <vector>
typedef std::map<std::vector<int>, int> associate_rule_t;
class apriori {
public:
apriori();
~apriori();
int push_item(int, const int *, int);
int cal_associate();
void reset_associate_rule();
bool next_associate_rule(associate_rule_t **);
private:
struct transaction {
int tid;
std::vector<std::vector<int> > items;
};
typedef std::vector<struct transaction *> transactions_t;
typedef std::vector<associate_rule_t *> itemset_t;
private:
void clear_rule();
void clear_tran();
bool cal_nassociate();
int recal_transaction();
void cal_support(associate_rule_t *);
int find_min_support(const associate_rule_t *, int *);
void rm_min_support(associate_rule_t *, int);
private:
transactions_t apr_tran;
itemset_t apr_rules;
itemset_t::iterator current;
};
#endif
==================================================
apriori.cpp:
==================================================
// 2012年 08月 01日 星期三 14:21:08 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#include <algorithm>
#include <utility>
#include <iostream>
#include <cassert>
#include "apriori.h"
using namespace std;
apriori::apriori()
:current(apr_rules.end())
{
}
apriori::~apriori()
{
clear_tran();
clear_rule();
}
int apriori::push_item(int tid, const int *items, int s)
{
struct transaction *t = new struct transaction;
t->tid = tid;
for(int i = 0; i < s; ++i)
t->items.push_back(vector<int>(1, items[i]));
sort(t->items.begin(), t->items.end());
apr_tran.push_back(t);
return 0;
}
int apriori::cal_associate()
{
while(cal_nassociate())
recal_transaction();
return 0;
}
void apriori::cal_support(associate_rule_t *rules)
{
for(vector<struct transaction *>::const_iterator pos
= apr_tran.begin(); pos != apr_tran.end(); ++pos) {
const vector<vector<int> > &items = (*pos)->items;
//cout << __LINE__ << endl;
for(vector<vector<int> >::const_iterator p = items.begin();
p != items.end(); ++p) {
//cout << __LINE__ << endl;
/*for(vector<int>::const_iterator ppp = p->begin();
ppp != p->end(); ++ppp)
cout << *ppp << ' ';*/
//cout << " : ";
++(*rules)[*p];
//cout << (*rules)[*p] << endl;
//cout << "-----------------" << endl;
}
}
assert(rules->size());
}
int apriori::find_min_support(const associate_rule_t *rules, int *c)
{
int min = rules->begin()->second;
*c = 0; // XXX
for(associate_rule_t::const_iterator pos = rules->begin();
pos != rules->end(); ++pos) {
if(pos->second == min) {
++*c;
} else if(pos->second < min) {
min = pos->second;
*c = 1;
}
}
return min;
}
bool apriori::cal_nassociate()
{
//cout << __FILE__ << endl;
if(!apr_tran.size())
return false;
associate_rule_t *rules = new associate_rule_t;
cal_support(rules);
const int s = rules->size();
if(s < 1)
return false;
int c;
const int min = find_min_support(rules, &c);
if(c < s)
rm_min_support(rules, min);
if(rules->size())
apr_rules.push_back(rules);
else
delete rules;
if(apr_tran.size())
return true;
return false;
}
void apriori::rm_min_support(associate_rule_t *rules, int min)
{
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end();) {
vector<vector<int> > &items = (*pos)->items;
for(vector<vector<int> >::iterator p = items.begin();
p != items.end();) {
/*for(vector<int>::iterator ppp = p->begin();
ppp != p->end(); ++ppp)
cout << *ppp << ' ';*/
//cout << "\n=================" << endl;
associate_rule_t::iterator f = rules->find(*p);
//assert(f != rules->end());
if(f != rules->end() && f->second == min) {
//++p;
items.erase(p);
rules->erase(f);
continue;
} else if(f == rules->end()) {
items.erase(p);
continue;
}
++p;
}
if(!items.size()) {
delete *pos;
apr_tran.erase(pos);
continue;
}
++pos;
}
}
void apriori::clear_rule()
{
for(itemset_t::iterator pos = apr_rules.begin();
pos != apr_rules.end(); ++pos)
delete *pos;
apr_rules.clear();
}
void apriori::clear_tran()
{
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end(); ++pos)
delete *pos;
apr_tran.clear();
}
int apriori::recal_transaction()
{
assert(apr_tran.size());
const int s = (*apr_tran.begin())->items.begin()->size();
//cout << "S: " << s << endl;
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end();) {
vector<vector<int> > &items = (*pos)->items;
if(items.size() > 1) {
struct transaction *t = new struct transaction;
t->tid = (*pos)->tid;
for(vector<vector<int> >::iterator p1 = items.begin();
p1 != items.end() - 1; ++p1) {
for(vector<vector<int> >::iterator p2
= p1 + 1; p2 != items.end(); ++p2) {
int i = 0;
vector<int> tmp;
for(vector<int>::iterator p1p = p1->begin(),
p2p = p2->begin(); i < s
&& p1p != p1->end(); ++i, ++p1p, ++p2p) {
/*cout << "p1p: " << *p1p << endl;
cout << "p2p: " << *p2p << endl;
cout << "I: " << i << endl;*/
if(*p1p == *p2p) {
tmp.push_back(*p1p);
} else if(*p1p < *p2p && i == s - 1) {
tmp.push_back(*p1p);
tmp.push_back(*p2p);
} else if(*p1p > *p2p && i == s - 1) {
tmp.push_back(*p2p);
tmp.push_back(*p1p);
} else {
break;
}
}
if(i == s) {
/*for(vector<int>::iterator p
= tmp.begin(); p != tmp.end(); ++p)
cout << *p << ' ';
cout << endl;
cout << "++++++++++++++++++\n";*/
t->items.push_back(tmp);
}
}
}
delete *pos;
if(t->items.size()) {
*pos = t;
++pos;
} else {
apr_tran.erase(pos);
}
continue;
} else {
delete *pos;
apr_tran.erase(pos);
}
}
return 0;
}
void apriori::reset_associate_rule()
{
current = apr_rules.begin();
}
bool apriori::next_associate_rule(associate_rule_t **p)
{
if(current == apr_rules.end())
return false;
*p = *current++;
return true;
}
========================================================
main.cpp:
========================================================
// 2012年 08月 07日 星期二 09:17:28 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#include <iostream>
#include <vector>
#include "apriori.h"
using namespace std;
int main()
{
apriori apr;
int item[7] = {1, 3, 4};
apr.push_item(100, item, 3);
item[0] = 2; item[1] = 3; item[2] = 5;
apr.push_item(200, item, 3);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(300, item, 4);
item[0] = 2; item[1] = 5;
apr.push_item(400, item, 2);
/*item[0] = 1; item[1] = 2; item[2] = 4; item[3] = 7;
apr.push_item(500, item, 4);
item[0] = 1; item[1] = 2; item[2] = 3;
apr.push_item(600, item, 3);
item[0] = 1; item[1] = 2; item[2] = 4; item[3] = 7;
apr.push_item(700, item, 4);
item[0] = 1; item[1] = 2;
apr.push_item(800, item, 2);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(700, item, 4);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5; item[4] = 6; item[5] = 7;
apr.push_item(900, item, 6);
item[0] = 5; item[1] = 6; item[2] = 7;
apr.push_item(110, item, 3);
item[0] = 2; item[1] = 3; item[2] = 7;
apr.push_item(120, item, 3);
item[0] = 1; item[1] = 4; item[2] = 7;
apr.push_item(128, item, 3);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(101, item, 4);*/
//
apr.cal_associate();
associate_rule_t *ar;
apr.reset_associate_rule();
while(apr.next_associate_rule(&ar)) {
for(associate_rule_t::iterator p = ar->begin();
p != ar->end(); ++p) {
const vector<int> &items = p->first;
for(vector<int>::const_iterator pp = items.begin();
pp != items.end(); ++pp)
cout << *pp << ' ';
cout << ": ";
cout << p->second << endl;
}
cout << "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=\n";
}
return 0;
}
==========================================================
1 概述
1.1 Apriori算法是由Rakesh Agrawal和Ramakrishnan Srikan
发明的一种数据挖掘算法。最初是解决找到transaction数据库中
不同item关联规则的问题。
2 算法
2.1 基本概念
I = {i1, i2, ..., im} 是由m个item组成的集合;
D是多个transaction组成的集合;D中的每个transaction T都是item组成的集合,
每一个T都有一个唯一的标示:TID;
关联规则:X, Y是T的子集,且X和Y的交集为空, 则X->Y是一个关联规则;
支持度:关联规则X->Y存在一个支持度s, 如果D中有s%的transaction含有
X和Y;
2.2 算法流程
假设每一个transaction有多个item组成,其中的item都是排序的;
每个transaction包括两个部分:TID和item ;
k-itemset是个集合,它的元素是由k个item组成itemset,其中的item是排序的,itemset也是排序的;
L[k]用于存放较大的k-itemset, 每个元素包括两个部分:itemset和支持度;
C[k]用于存放k-itemset的候选集合,每个元素包括itemset和TID;
输入:transaction集合
输出:L[k]
1) 初始化候选集合C[k]
2) 分别计算每个itemset的数量
3) 找到数量最小的itemset
4) 在C[k]中删除数量最小的itemset,把剩下的itemset加入到k-itemset
5) 将k-itemset存放到L[k]中
6) 重新生成候选集合C[k],若C[k]的元素数量大于1,则执行1),否则退出
3 算法的简单实现
3.1 后面的代码只是演示算法的C++原型程序,设计和编写上有很多不适当的地方。
没有经过仔细的测试,但基本是正确的。main函数中的例子来自于Rakesh Agrawal
和Ramakrishnan Srikan的论文。
编译:
g++ -g -W -Wall -Wextra -o mytest apriori.cpp main.cpp
执行:
./mytest
3.2 代码
apriori.h:
=========================================================
// 2012年 08月 01日 星期三 13:46:13 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#ifndef APRIORI_H
#define APRIORI_H
#include <map>
#include <vector>
typedef std::map<std::vector<int>, int> associate_rule_t;
class apriori {
public:
apriori();
~apriori();
int push_item(int, const int *, int);
int cal_associate();
void reset_associate_rule();
bool next_associate_rule(associate_rule_t **);
private:
struct transaction {
int tid;
std::vector<std::vector<int> > items;
};
typedef std::vector<struct transaction *> transactions_t;
typedef std::vector<associate_rule_t *> itemset_t;
private:
void clear_rule();
void clear_tran();
bool cal_nassociate();
int recal_transaction();
void cal_support(associate_rule_t *);
int find_min_support(const associate_rule_t *, int *);
void rm_min_support(associate_rule_t *, int);
private:
transactions_t apr_tran;
itemset_t apr_rules;
itemset_t::iterator current;
};
#endif
==================================================
apriori.cpp:
==================================================
// 2012年 08月 01日 星期三 14:21:08 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#include <algorithm>
#include <utility>
#include <iostream>
#include <cassert>
#include "apriori.h"
using namespace std;
apriori::apriori()
:current(apr_rules.end())
{
}
apriori::~apriori()
{
clear_tran();
clear_rule();
}
int apriori::push_item(int tid, const int *items, int s)
{
struct transaction *t = new struct transaction;
t->tid = tid;
for(int i = 0; i < s; ++i)
t->items.push_back(vector<int>(1, items[i]));
sort(t->items.begin(), t->items.end());
apr_tran.push_back(t);
return 0;
}
int apriori::cal_associate()
{
while(cal_nassociate())
recal_transaction();
return 0;
}
void apriori::cal_support(associate_rule_t *rules)
{
for(vector<struct transaction *>::const_iterator pos
= apr_tran.begin(); pos != apr_tran.end(); ++pos) {
const vector<vector<int> > &items = (*pos)->items;
//cout << __LINE__ << endl;
for(vector<vector<int> >::const_iterator p = items.begin();
p != items.end(); ++p) {
//cout << __LINE__ << endl;
/*for(vector<int>::const_iterator ppp = p->begin();
ppp != p->end(); ++ppp)
cout << *ppp << ' ';*/
//cout << " : ";
++(*rules)[*p];
//cout << (*rules)[*p] << endl;
//cout << "-----------------" << endl;
}
}
assert(rules->size());
}
int apriori::find_min_support(const associate_rule_t *rules, int *c)
{
int min = rules->begin()->second;
*c = 0; // XXX
for(associate_rule_t::const_iterator pos = rules->begin();
pos != rules->end(); ++pos) {
if(pos->second == min) {
++*c;
} else if(pos->second < min) {
min = pos->second;
*c = 1;
}
}
return min;
}
bool apriori::cal_nassociate()
{
//cout << __FILE__ << endl;
if(!apr_tran.size())
return false;
associate_rule_t *rules = new associate_rule_t;
cal_support(rules);
const int s = rules->size();
if(s < 1)
return false;
int c;
const int min = find_min_support(rules, &c);
if(c < s)
rm_min_support(rules, min);
if(rules->size())
apr_rules.push_back(rules);
else
delete rules;
if(apr_tran.size())
return true;
return false;
}
void apriori::rm_min_support(associate_rule_t *rules, int min)
{
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end();) {
vector<vector<int> > &items = (*pos)->items;
for(vector<vector<int> >::iterator p = items.begin();
p != items.end();) {
/*for(vector<int>::iterator ppp = p->begin();
ppp != p->end(); ++ppp)
cout << *ppp << ' ';*/
//cout << "\n=================" << endl;
associate_rule_t::iterator f = rules->find(*p);
//assert(f != rules->end());
if(f != rules->end() && f->second == min) {
//++p;
items.erase(p);
rules->erase(f);
continue;
} else if(f == rules->end()) {
items.erase(p);
continue;
}
++p;
}
if(!items.size()) {
delete *pos;
apr_tran.erase(pos);
continue;
}
++pos;
}
}
void apriori::clear_rule()
{
for(itemset_t::iterator pos = apr_rules.begin();
pos != apr_rules.end(); ++pos)
delete *pos;
apr_rules.clear();
}
void apriori::clear_tran()
{
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end(); ++pos)
delete *pos;
apr_tran.clear();
}
int apriori::recal_transaction()
{
assert(apr_tran.size());
const int s = (*apr_tran.begin())->items.begin()->size();
//cout << "S: " << s << endl;
for(transactions_t::iterator pos = apr_tran.begin();
pos != apr_tran.end();) {
vector<vector<int> > &items = (*pos)->items;
if(items.size() > 1) {
struct transaction *t = new struct transaction;
t->tid = (*pos)->tid;
for(vector<vector<int> >::iterator p1 = items.begin();
p1 != items.end() - 1; ++p1) {
for(vector<vector<int> >::iterator p2
= p1 + 1; p2 != items.end(); ++p2) {
int i = 0;
vector<int> tmp;
for(vector<int>::iterator p1p = p1->begin(),
p2p = p2->begin(); i < s
&& p1p != p1->end(); ++i, ++p1p, ++p2p) {
/*cout << "p1p: " << *p1p << endl;
cout << "p2p: " << *p2p << endl;
cout << "I: " << i << endl;*/
if(*p1p == *p2p) {
tmp.push_back(*p1p);
} else if(*p1p < *p2p && i == s - 1) {
tmp.push_back(*p1p);
tmp.push_back(*p2p);
} else if(*p1p > *p2p && i == s - 1) {
tmp.push_back(*p2p);
tmp.push_back(*p1p);
} else {
break;
}
}
if(i == s) {
/*for(vector<int>::iterator p
= tmp.begin(); p != tmp.end(); ++p)
cout << *p << ' ';
cout << endl;
cout << "++++++++++++++++++\n";*/
t->items.push_back(tmp);
}
}
}
delete *pos;
if(t->items.size()) {
*pos = t;
++pos;
} else {
apr_tran.erase(pos);
}
continue;
} else {
delete *pos;
apr_tran.erase(pos);
}
}
return 0;
}
void apriori::reset_associate_rule()
{
current = apr_rules.begin();
}
bool apriori::next_associate_rule(associate_rule_t **p)
{
if(current == apr_rules.end())
return false;
*p = *current++;
return true;
}
========================================================
main.cpp:
========================================================
// 2012年 08月 07日 星期二 09:17:28 CST
// author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
// K.I.S.S
// S.P.O.T
#include <iostream>
#include <vector>
#include "apriori.h"
using namespace std;
int main()
{
apriori apr;
int item[7] = {1, 3, 4};
apr.push_item(100, item, 3);
item[0] = 2; item[1] = 3; item[2] = 5;
apr.push_item(200, item, 3);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(300, item, 4);
item[0] = 2; item[1] = 5;
apr.push_item(400, item, 2);
/*item[0] = 1; item[1] = 2; item[2] = 4; item[3] = 7;
apr.push_item(500, item, 4);
item[0] = 1; item[1] = 2; item[2] = 3;
apr.push_item(600, item, 3);
item[0] = 1; item[1] = 2; item[2] = 4; item[3] = 7;
apr.push_item(700, item, 4);
item[0] = 1; item[1] = 2;
apr.push_item(800, item, 2);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(700, item, 4);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5; item[4] = 6; item[5] = 7;
apr.push_item(900, item, 6);
item[0] = 5; item[1] = 6; item[2] = 7;
apr.push_item(110, item, 3);
item[0] = 2; item[1] = 3; item[2] = 7;
apr.push_item(120, item, 3);
item[0] = 1; item[1] = 4; item[2] = 7;
apr.push_item(128, item, 3);
item[0] = 1; item[1] = 2; item[2] = 3; item[3] = 5;
apr.push_item(101, item, 4);*/
//
apr.cal_associate();
associate_rule_t *ar;
apr.reset_associate_rule();
while(apr.next_associate_rule(&ar)) {
for(associate_rule_t::iterator p = ar->begin();
p != ar->end(); ++p) {
const vector<int> &items = p->first;
for(vector<int>::const_iterator pp = items.begin();
pp != items.end(); ++pp)
cout << *pp << ' ';
cout << ": ";
cout << p->second << endl;
}
cout << "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=\n";
}
return 0;
}
==========================================================
更多推荐
所有评论(0)