FPTree的一种java实现

关联分析是用来做什么的?这边有一个经典的例子“超市购物单”,文件market内容如下:

牛奶,鸡蛋,面包,薯片鸡蛋,爆米花,薯片,啤酒鸡蛋,面包,薯片牛奶,鸡蛋,面包,爆米花,薯片,啤酒牛奶,面包,啤酒鸡蛋,面包,啤酒牛奶,面包,薯片牛奶,鸡蛋,面包,黄油,薯片牛奶,鸡蛋,黄油,薯片每一行可以看作一个购物单,关联分析就是用来分析哪些物品经常会被同时购买(也就是关联度较大)。

其中一个分析算法的java实现如下:

import java.io.BufferedReader;import java.io.FileReader;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import java.util.HashMap;import java.util.LinkedList;import java.util.List;import java.util.Map;import java.util.Map.Entry;import java.util.Set;public class FPTree {private int minSuport;public int getMinSuport() {return minSuport;}public void setMinSuport(int minSuport) {this.minSuport = minSuport;}// 从若干个文件中读入Transaction Recordpublic List<List<String>> readTransRocords(String… filenames) {List<List<String>> transaction = null;if (filenames.length > 0) {transaction = new LinkedList<List<String>>();for (String filename : filenames) {try {FileReader fr = new FileReader(filename);BufferedReader br = new BufferedReader(fr);try {String line;List<String> record;while ((line = br.readLine()) != null) {if (line.trim().length() > 0) {String str[] = line.split(",");record = new LinkedList<String>();for (String w : str)record.add(w);transaction.add(record);}}} finally {br.close();}} catch (IOException ex) {System.out.println("Read transaction records failed."+ ex.getMessage());System.exit(1);}}}return transaction;}// FP-Growth算法public void FPGrowth(List<List<String>> transRecords,List<String> postPattern) {// 构建项头表,同时也是频繁1项集ArrayList<TreeNode> HeaderTable = buildHeaderTable(transRecords);// 构建FP-TreeTreeNode treeRoot = buildFPTree(transRecords, HeaderTable);// 如果FP-Tree为空则返回if (treeRoot.getChildren() == null|| treeRoot.getChildren().size() == 0)return;// 输出项头表的每一项+postPatternif (postPattern != null) {for (TreeNode header : HeaderTable) {System.out.print(header.getCount() + "\t" + header.getName());for (String ele : postPattern)System.out.print("\t" + ele);System.out.println();}}// 找到项头表的每一项的条件模式基,进入递归迭代for (TreeNode header : HeaderTable) {// 后缀模式增加一项List<String> newPostPattern = new LinkedList<String>();newPostPattern.add(header.getName());if (postPattern != null)newPostPattern.addAll(postPattern);// 寻找header的条件模式基CPB,放入newTransRecords中List<List<String>> newTransRecords = new LinkedList<List<String>>();TreeNode backnode = header.getNextHomonym();while (backnode != null) {int counter = backnode.getCount();List<String> prenodes = new ArrayList<String>();TreeNode parent = backnode;// 遍历backnode的祖先节点,放到prenodes中while ((parent = parent.getParent()).getName() != null) {prenodes.add(parent.getName());}while (counter– > 0) {newTransRecords.add(prenodes);}backnode = backnode.getNextHomonym();}// 递归迭代FPGrowth(newTransRecords, newPostPattern);}}// 构建项头表,同时也是频繁1项集public ArrayList<TreeNode> buildHeaderTable(List<List<String>> transRecords) {ArrayList<TreeNode> F1 = null;if (transRecords.size() > 0) {F1 = new ArrayList<TreeNode>();Map<String, TreeNode> map = new HashMap<String, TreeNode>();// 计算事务数据库中各项的支持度for (List<String> record : transRecords) {for (String item : record) {if (!map.keySet().contains(item)) {TreeNode node = new TreeNode(item);node.setCount(1);map.put(item, node);} else {map.get(item).countIncrement(1);}}}// 把支持度大于(或等于)minSup的项加入到F1中Set<String> names = map.keySet();for (String name : names) {TreeNode tnode = map.get(name);if (tnode.getCount() >= minSuport) {F1.add(tnode);}}Collections.sort(F1);return F1;} else {return null;}}// 构建FP-Treepublic TreeNode buildFPTree(List<List<String>> transRecords,ArrayList<TreeNode> F1) {TreeNode root = new TreeNode(); // 创建树的根节点for (List<String> transRecord : transRecords) {LinkedList<String> record = sortByF1(transRecord, F1);TreeNode subTreeRoot = root;TreeNode tmpRoot = null;if (root.getChildren() != null) {while (!record.isEmpty()&& (tmpRoot = subTreeRoot.findChild(record.peek())) != null) {tmpRoot.countIncrement(1);subTreeRoot = tmpRoot;record.poll();}}addNodes(subTreeRoot, record, F1);}return root;}// 把交易记录按项的频繁程序降序排列public LinkedList<String> sortByF1(List<String> transRecord,ArrayList<TreeNode> F1) {Map<String, Integer> map = new HashMap<String, Integer>();for (String item : transRecord) {// 由于F1已经是按降序排列的,for (int i = 0; i < F1.size(); i++) {TreeNode tnode = F1.get(i);if (tnode.getName().equals(item)) {map.put(item, i);}}}ArrayList<Entry<String, Integer>> al = new ArrayList<Entry<String, Integer>>(map.entrySet());Collections.sort(al, new Comparator<Map.Entry<String, Integer>>() {@Overridepublic int compare(Entry<String, Integer> arg0,Entry<String, Integer> arg1) {// 降序排列return arg0.getValue() – arg1.getValue();}});LinkedList<String> rest = new LinkedList<String>();for (Entry<String, Integer> entry : al) {rest.add(entry.getKey());}return rest;}// 把record作为ancestor的后代插入树中public void addNodes(TreeNode ancestor, LinkedList<String> record,ArrayList<TreeNode> F1) {if (record.size() > 0) {while (record.size() > 0) {String item = record.poll();TreeNode leafnode = new TreeNode(item);leafnode.setCount(1);leafnode.setParent(ancestor);ancestor.addChild(leafnode);for (TreeNode f1 : F1) {if (f1.getName().equals(item)) {while (f1.getNextHomonym() != null) {f1 = f1.getNextHomonym();}f1.setNextHomonym(leafnode);break;}}addNodes(leafnode, record, F1);}}}public static void main(String[] args) {FPTree fptree = new FPTree();fptree.setMinSuport(4);List<List<String>> transRecords = fptree.readTransRocords(System.getProperty("user.dir") + "\\resource\\market");fptree.FPGrowth(transRecords, null);}}

可以看到main方法里,读market文件,指定要关联次数4次以上的物品,,

调用FPGrowth方法输出如下:

6薯片鸡蛋5薯片面包5鸡蛋面包4薯片鸡蛋面包5薯片牛奶5面包牛奶4鸡蛋牛奶4薯片面包牛奶4薯片鸡蛋牛奶

怠惰是贫穷的制造厂。

FPTree的一种java实现

相关文章:

你感兴趣的文章:

标签云: