R语言中的arules包系列为表示、处理和分析事务数据与模式提供了基础架构,基于频繁项集(frequent itemsets)和关联规则(association rules)进行挖掘。
该包不仅实现了多种兴趣度量指标和关联规则挖掘算法,还集成了Christian Borgelt提供的高效Apriori和Eclat C语言实现版本的代码。
rm(list=ls())
library(arulesViz)
library(arules)
data("IncomeESL")
trans <- transactions(IncomeESL)
trans
rules <- apriori(trans, supp = 0.1, conf = 0.9, target = "rules")
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 899
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[84 item(s), 8993 transaction(s)] done [0.00s].
## sorting and recoding items ... [42 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [457 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
trans <- as(trans, "transactions")
print(trans)
## 可视化频率top30的项目
par(family = "STKaiti",cex = 1)
itemFrequencyPlot(trans,
top = 30,
col = "lightblue",
xlab = "samples",
ylab = "Frequency",
main = "")
inspect(head(rules, n = 3, by = "lift"))
# lhs rhs support confidence coverage lift count
# [1] {dual incomes=no,
# householder status=own} => {marital status=married} 0.1016346 0.9713071 0.1046369 2.619965 914
# [2] {years in bay area=>10,
# dual incomes=yes,
# type of home=house} => {marital status=married} 0.1003002 0.9605964 0.1044145 2.591075 902
# [3] {dual incomes=yes,
# householder status=own,
# type of home=house,
# language in home=english} => {marital status=married} 0.1098632 0.9601555 0.1144223 2.589886 988
inspectDT(sort(rules,by = "lift"))
## 对得出的关联规则按照置信度进行排序,也可根据需要根据支持度或者提升度进行排序
myrules.a <- sort(rules, by = "confidence", decreasing = T )
## 排序后查看结果
inspect(myrules.a[1:20])
提取前30的项目
交互式可视化展示
# 设置目标名(可以是多个),进行筛选
# target <- c("occupation=student","number in household=4",
# "householder status=live with parents/family",
# "type of home=apartment",
# "occupation=professional/managerial",
# "householder status=own",
# "dual incomes=not married")
# lhs_labels <- labels(lhs(myrules.a))
# rhs_labels <- labels(rhs(myrules.a))
# # 拆分字符串为项目向量
# split_items <- function(x) {
# x <- gsub("[{}]", "", x) # 去掉大括号
# x <- unlist(strsplit(x, ",")) # 按逗号拆分
# x <- trimws(x) # 去除前后空格
# return(x)
# }
# # 应用到所有规则
# lhs_items <- lapply(lhs_labels, split_items)
# rhs_items <- lapply(rhs_labels, split_items)
#
#
# # 过滤函数:检查每条规则的 lhs/rhs 是否只包含目标名
# is_all_in_target <- function(items, herbs) {
# sapply(items, function(x) all(x %in% herbs))
# }
#
# # 找出 lhs 和 rhs 都完全在目标名称列表中的规则索引
# valid_lhs <- is_all_in_target(lhs_items, target)
# valid_rhs <- is_all_in_target(rhs_items, target)
# valid_rules_index <- which(valid_lhs & valid_rhs)
#
# # 提取规则
# filtered_rules <- myrules.a[valid_rules_index]
#
# # 查看提取到的规则
# inspect(filtered_rules)
#
# write.csv(as(filtered_rules, "data.frame"),
# file = "筛选后目标样本-关联规则.csv", row.names = FALSE)
### 假设不进行过滤
lhs_labels <- labels(lhs(myrules.a))
rhs_labels <- labels(rhs(myrules.a))
rules_df <- as(myrules.a, "data.frame")
head(rules_df)
write.csv(rules_df, "所有关联规则.csv", row.names = FALSE)
library(arulesViz)
plot(myrules.a)
#plot(myrules.a, control = list(jitter = 0))
plot(myrules.a, shading = "order")
plot(myrules.a, method = "grouped")
plot(myrules.a, method = "graph")
plot(myrules.a, method="graph",layout=igraph::in_circle())
plot(myrules.a, method="paracoord")
## 使用可交互的网络图可视化获得的规则
plot(myrules.a, method="graph",engine="htmlwidget",igraphLayout = "layout_in_circle")
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。