前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >数据挖掘之关联规则算法Apriori总结

数据挖掘之关联规则算法Apriori总结

作者头像
Gxjun
发布2018-03-27 12:11:19
1K0
发布2018-03-27 12:11:19
举报
文章被收录于专栏:ml

项目中有时需要对数据进行关联分析,例如分析一家小商店中顾客的购买习惯。下面用 Java 实现 Apriori 算法:从单个商品开始逐层扩展项集,输出每一维度中出现比例不低于给定阈值的商品组合。

代码语言:java
复制
  1 package com.data.algorithm;
  2 
  3 import com.google.common.base.Splitter;
  4 import com.google.common.collect.Lists;
  5 import com.google.common.collect.Maps;
  6 import org.slf4j.Logger;
  7 import org.slf4j.LoggerFactory;
  8 
  9 import java.io.BufferedReader;
 10 import java.io.FileInputStream;
 11 import java.io.IOException;
 12 import java.io.InputStreamReader;
 13 import java.util.*;
 14 
 15 /**
 16  * *********************************************************
 17  * <p/>
 18  * Author:     XiJun.Gong
 19  * Date:       2017-01-20 15:06
 20  * Version:    default 1.0.0
 21  * Class description:
 22  * <p/>
 23  * *********************************************************
 24  */
 25 
 26 class EOC {
 27 
 28     private static final Logger logger = LoggerFactory.getLogger(EOC.class);
 29     private Map<String, Integer> fmap;  //forward map
 30     private Map<Integer, String> bmap;  //backward map
 31     private List<Map<String, Integer>> elements = null;
 32 
 33     private Integer maxDimension;
 34 
 35     public EOC(final String pathFile, String separatSeq) {
 36 
 37         BufferedReader bufferedReader = null;
 38         try {
 39             this.fmap = Maps.newHashMap();
 40             this.bmap = Maps.newHashMap();
 41             this.elements = Lists.newArrayList();
 42             maxDimension = 0;
 43             bufferedReader = new BufferedReader(
 44                     new InputStreamReader(
 45                             new FileInputStream(pathFile), "UTF-8"));
 46             String _line = null;
 47             Integer keyValue = null, mapIndex = 0;
 48             while ((_line = bufferedReader.readLine()) != null) {
 49                 Map<String, Integer> lineMap = Maps.newHashMap();
 50                 if (_line.trim().length() > 1) {
 51                     if (separatSeq.trim().length() < 1) {
 52                         separatSeq = ",";
 53                     }
 54                     for (String word : Splitter.on(separatSeq).split(_line)) {
 55                         word = word.trim();
 56                         if (null == (keyValue = fmap.get(word))) {
 57                             keyValue = mapIndex++;
 58                         }
 59                         fmap.put(word, keyValue);
 60                         bmap.put(keyValue, word);
 61                         lineMap.put(word, keyValue);
 62                     }
 63                     if (maxDimension < lineMap.size())
 64                         maxDimension = lineMap.size();
 65                     elements.add(lineMap);
 66                 }
 67             }
 68         } catch (Exception e) {
 69             logger.error("读取文件出错 , 错误原因:{}", e);
 70         } finally {
 71             if (bufferedReader != null) {
 72                 try {
 73                     bufferedReader.close();
 74                 } catch (IOException e) {
 75                     logger.error("bufferedReader , 错误原因:{}", e);
 76                 }
 77             }
 78         }
 79     }
 80 
 81     public Integer getMaxDimension() {
 82         return maxDimension;
 83     }
 84 
 85     public float getRateOfSet(Collection<Integer> elementChild) {
 86         float rateCnt = 0f;
 87         int allSize = 1;
 88         for (Map<String, Integer> eMap : elements) {
 89             boolean flag = true;
 90             for (Integer element : elementChild) {
 91                 if (null == eMap.get(bmap.get(element))) {
 92                     flag = false;
 93                     break;
 94                 }
 95             }
 96             if (flag) rateCnt += 1;
 97         }
 98         return rateCnt / ((allSize = elements.size()) > 1 ? (float) allSize : 1.0f);
 99     }
100 
101     public Set<Integer> getElements() {
102 
103         return new HashSet<Integer>(fmap.values());
104     }
105 
106     public Integer queryByKey(String key) {
107         return fmap.get(key);
108     }
109 
110     public String queryByValue(Integer value) {
111         return bmap.get(value);
112     }
113 }
114 
115 public class Apriori {
116     private static final Logger logger = LoggerFactory.getLogger(Apriori.class);
117     private EOC eoc = null;
118     private Integer maxDimension;
119     private final float exp = 1e-4f;
120 
121     public Apriori(final String pathFile, String separatSeq, Integer maxDimension) {
122         this(pathFile, separatSeq);
123         this.maxDimension = maxDimension;
124     }
125 
126     public Apriori(final String pathFile, String separatSeq) {
127         this.eoc = new EOC(pathFile, separatSeq);
128         this.maxDimension = this.eoc.getMaxDimension();
129     }
130 
131     public void work(float confidenceLevel) {
132         List<Set<Integer>> listElement = null;
133         ArrayList<Set<Integer>> middleWareElement = null;
134         Map<Set<Integer>, Float> maps = null;
135         listElement = Lists.newArrayList();
136         for (Integer element : this.eoc.getElements()) {
137             Set<Integer> set = new HashSet<Integer>();
138             set.add(element);
139             listElement.add(set);
140         }
141         maps = Maps.newHashMap();
142         middleWareElement = Lists.newArrayList();
143         for (int i = 1; i < this.maxDimension; i++) {
144             for (Set<Integer> tmpSet : listElement) {
145                 float rate = eoc.getRateOfSet(tmpSet);
146                 if (confidenceLevel - exp <= rate)
147                     maps.put(tmpSet, rate);
148             }
149             System.out.println("+++++++++++第 " + i + " 维度关联数据+++++++++++");
150             output(maps);
151             listElement.clear();
152             middleWareElement.addAll(maps.keySet());
153             maps.clear();
154             for (int j = 0; j < middleWareElement.size(); j++) {
155                 Set<Integer> tmpSet = middleWareElement.get(j);
156                 for (int k = j + 1; k < middleWareElement.size(); k++) {
157                     Set<Integer> setChild = middleWareElement.get(k);
158                     for (Integer label : setChild) {
159                         if (!tmpSet.contains(label)) {
160                             Set<Integer> newElement = new HashSet<Integer>(tmpSet);
161                             newElement.add(label);
162                             if (!listElement.contains(newElement)) {
163                                 listElement.add(newElement);
164                                 break;
165                             }
166                         }
167                     }
168                 }
169             }
170             middleWareElement.clear();
171         }
172     }
173 
174     public void output(Map<Set<Integer>, Float> maps) {
175         for (Map.Entry<Set<Integer>, Float> iter : maps.entrySet()) {
176             for (Integer integer : iter.getKey()) {
177                 System.out.print(eoc.queryByValue(integer) + " ");
178             }
179             System.out.println(iter.getValue()*100+"%");
180         }
181     }
182 }
代码语言:java
复制
 1 package com.data.algorithm;
 2 
 3 
 4 /**
 5  * *********************************************************
 6  * <p/>
 7  * Author:     XiJun.Gong
 8  * Date:       2017-01-17 17:57
 9  * Version:    default 1.0.0
10  * Class description:
11  * <p/>
12  * *********************************************************
13  */
14 public class Main {
15     public static void main(String args[]) {
16         Apriori apriori = new Apriori("/home/com/src/main/java/com/qunar/data/algorithm/demo.data", ",");
17         apriori.work(0.5f);
18     }
19 }
代码语言:javascript
复制
 1 +++++++++++第 1 维度关联数据+++++++++++
 2 苹果 50.0%
 3 西红柿 75.0%
 4 香蕉 75.0%
 5 矿泉水 75.0%
 6 +++++++++++第 2 维度关联数据+++++++++++
 7 苹果 西红柿 50.0%
 8 西红柿 香蕉 50.0%
 9 西红柿 矿泉水 50.0%
10 香蕉 矿泉水 75.0%
11 +++++++++++第 3 维度关联数据+++++++++++
12 西红柿 香蕉 矿泉水 50.0%
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2017-01-20 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档