Apriori关联分析学习(arules包)

原创

凑齐六个字吧

发布于 2025-07-14 23:41:52

1480

文章被收录于专栏：临床预测模型临床预测模型

R语言中的arules包系列为表示、处理和分析事务数据与模式提供了基础架构，基于频繁项集(frequent itemsets)和关联规则(association rules)进行挖掘。

该包不仅实现了多种兴趣度量指标和关联规则挖掘算法，还集成了Christian Borgelt提供的高效Apriori和Eclat C语言实现版本的代码。

分析流程

1.导入

rm(list=ls())
library(arulesViz)
library(arules)
data("IncomeESL")

trans <- transactions(IncomeESL)
trans

2.数据预处理

rules <- apriori(trans, supp = 0.1, conf = 0.9, target = "rules")
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 899 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[84 item(s), 8993 transaction(s)] done [0.00s].
## sorting and recoding items ... [42 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [457 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

3.频率统计和关联

trans <- as(trans, "transactions")
print(trans)

## 可视化频率top30的项目
par(family = "STKaiti",cex = 1)
itemFrequencyPlot(trans,
                  top = 30,
                  col = "lightblue",
                  xlab = "samples",                  
                  ylab = "Frequency",
                  main = "")


inspect(head(rules, n = 3, by = "lift"))
#     lhs                           rhs                        support confidence  coverage     lift count
# [1] {dual incomes=no,                                                                                   
#      householder status=own}   => {marital status=married} 0.1016346  0.9713071 0.1046369 2.619965   914
# [2] {years in bay area=>10,                                                                             
#      dual incomes=yes,                                                                                  
#      type of home=house}       => {marital status=married} 0.1003002  0.9605964 0.1044145 2.591075   902
# [3] {dual incomes=yes,                                                                                  
#      householder status=own,                                                                            
#      type of home=house,                                                                                
#      language in home=english} => {marital status=married} 0.1098632  0.9601555 0.1144223 2.589886   988
inspectDT(sort(rules,by = "lift"))
## 对得出的关联规则按照置信度进行排序，也可根据需要根据支持度或者提升度进行排序
myrules.a <- sort(rules, by = "confidence", decreasing = T ) 
## 排序后查看结果
inspect(myrules.a[1:20])

提取前30的项目

交互式可视化展示

4.提取目标

# 设置目标名（可以是多个），进行筛选
# target <- c("occupation=student","number in household=4",
#             "householder status=live with parents/family",
#             "type of home=apartment",
#             "occupation=professional/managerial",
#             "householder status=own",
#             "dual incomes=not married")

# lhs_labels <- labels(lhs(myrules.a))
# rhs_labels <- labels(rhs(myrules.a))
# # 拆分字符串为项目向量
# split_items <- function(x) {
#   x <- gsub("[{}]", "", x)          # 去掉大括号
#   x <- unlist(strsplit(x, ","))     # 按逗号拆分
#   x <- trimws(x)                    # 去除前后空格
#   return(x)
# }
# # 应用到所有规则
# lhs_items <- lapply(lhs_labels, split_items)
# rhs_items <- lapply(rhs_labels, split_items)
# 
# 
# # 过滤函数：检查每条规则的 lhs/rhs 是否只包含目标名
# is_all_in_target <- function(items, herbs) {
#   sapply(items, function(x) all(x %in% herbs))
# }
# 
# # 找出 lhs 和 rhs 都完全在目标名称列表中的规则索引
# valid_lhs <- is_all_in_target(lhs_items, target)
# valid_rhs <- is_all_in_target(rhs_items, target)
# valid_rules_index <- which(valid_lhs & valid_rhs)
# 
# # 提取规则
# filtered_rules <- myrules.a[valid_rules_index]
# 
# # 查看提取到的规则
# inspect(filtered_rules)
# 
# write.csv(as(filtered_rules, "data.frame"),
#           file = "筛选后目标样本-关联规则.csv", row.names = FALSE)


### 假设不进行过滤
lhs_labels <- labels(lhs(myrules.a))
rhs_labels <- labels(rhs(myrules.a))
rules_df <- as(myrules.a, "data.frame")
head(rules_df)
write.csv(rules_df, "所有关联规则.csv", row.names = FALSE)

5.可视化

library(arulesViz)
plot(myrules.a)
#plot(myrules.a, control = list(jitter = 0))
plot(myrules.a, shading = "order")
plot(myrules.a, method = "grouped")
plot(myrules.a, method = "graph")
plot(myrules.a, method="graph",layout=igraph::in_circle())
plot(myrules.a, method="paracoord")
## 使用可交互的网络图可视化获得的规则
plot(myrules.a, method="graph",engine="htmlwidget",igraphLayout = "layout_in_circle")