Skip to content
Snippets Groups Projects
Unverified Commit 6c423952 authored by BARBIER Marc's avatar BARBIER Marc
Browse files

Refactoring and added support for required items in CM-SPAM

Refactored to reduce the amount of repeated code.
parent e25e5a16
No related branches found
No related tags found
No related merge requests found
......@@ -19,6 +19,7 @@ import be.uantwerpen.ldataminining.preprocessing.ArffUtils;
import be.uantwerpen.ldataminining.preprocessing.CSVUtils;
import be.uantwerpen.ldataminining.utils.CollectionUtils;
import be.uantwerpen.ldataminining.utils.IOUtils;
import be.uantwerpen.ldataminining.utils.Triple;
public class ArffToSPMF {
private ArffToSPMF(){}
......@@ -68,93 +69,20 @@ public class ArffToSPMF {
return new Pair<>(windowIdsSorted, groupByWindow);
}
static Pair<File,File> arffToSPMFTransactionalDatabase(File arff, List<String> columnsFiltered) throws IOException {
/**
*
* @param arff
* @param columnsFiltered
* @return Triple<DictFile, TransactionFile, dict>
* @throws IOException
*/
static Triple<File,File, Map<String,Integer>> arffToSPMF(boolean isSequence, File arff, List<String> columnsFiltered) throws IOException {
List<String> names = ArffUtils.getAttributeNames(arff);
List<List<String>> rows = ArffUtils.loadDataMatrix(arff);
System.out.format("toTransactionalDatabase: columns: %s\n".format(CollectionUtils.join(columnsFiltered)));
//1. make item representation
File dict = new File("./temp/", IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-dict"),"txt").getName());
Map<String,Integer> itemsSortedMap = oneHotEncodingItems(names,rows,columnsFiltered,dict);
//2. Group by window
Pair<TreeSet<Integer>, ListMap<Integer,List<String>>> pair = groupByWindow(names, rows);
TreeSet<Integer> windowIdsSorted = pair.getFirst();
ListMap<Integer,List<String>> groupByWindow = pair.getSecond();
//3. create table/grouped representation:
System.out.println("Creating transaction database. Nr of windows: " + windowIdsSorted.size());
List<String> transactions = new ArrayList<String>();
for(Integer window: windowIdsSorted) {
List<List<String>> row_group = groupByWindow.get(window);
Set<Integer> transaction = new TreeSet<>();
for(String column: columnsFiltered) {
int colIdx = names.indexOf(column);
for(List<String> row: row_group) {
String value = row.get(colIdx);
if(!CSVUtils.isEmptyValueInCSV(value)) {
Integer itemId = itemsSortedMap.get(String.format("%s=%s", column, value));
transaction.add(itemId);
}
}
}
transactions.add(CollectionUtils.join(transaction, " "));
}
File transactiondb = new File("./temp",
IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-transactions"),"txt").getName());
IOUtils.saveFile(transactions,transactiondb);
//DEBUG:
postConditionCheckTranslation(dict, transactions, transactiondb, names, itemsSortedMap, groupByWindow);
return new Pair<>(dict,transactiondb);
}
/**
 * Debug-only post-condition check: for the first window, prints the raw column
 * values, their translated item ids, the encoded transaction line, and that
 * line translated back to labels via the dictionary file. Lets a developer
 * visually verify that the ARFF -> SPMF encoding round-trips correctly.
 *
 * @param dict           dictionary file mapping item id to "column=value" label
 * @param transactions   encoded transaction lines (space-separated item ids)
 * @param transactiondb  file the transactions were written to (currently unused; kept for context)
 * @param names          attribute (column) names of the ARFF file
 * @param itemsSortedMap mapping from "column=value" to numeric item id
 * @param groupByWindow  data rows grouped per window id
 * @throws IOException if the dictionary file cannot be read
 */
private static void postConditionCheckTranslation(File dict, List<String> transactions, File transactiondb, List<String> names, Map<String,Integer> itemsSortedMap, ListMap<Integer,List<String>> groupByWindow) throws IOException {
    System.out.println("postConditionCheckTranslation:");
    Map<String, String> itemToLabel = loadTranslation(dict);
    // Only the first window (i == 0) is inspected; raise the bound to debug more windows.
    for(int i=0; i<1; i++) {
        System.out.println("Window " + i + ":");
        List<List<String>> groupByRow = groupByWindow.get(i);
        if(groupByRow == null)
            return; // window id 0 absent: nothing to check
        // Transpose the rows into per-column series for readable printing.
        List<List<String>> series = new ArrayList<>();
        System.out.println("Raw data:");
        for(int col=0; col<names.size(); col++) {
            List<String> valsCol = new ArrayList<>();
            for(List<String> row: groupByRow) {
                valsCol.add(row.get(col));
            }
            series.add(valsCol);
            System.out.println(valsCol);
        }
        // Translate each raw value to its item id (null when not in the dictionary,
        // e.g. for empty/filtered-out values).
        System.out.println("Raw data translated:");
        int k=0;
        for(List<String> serie: series) {
            List<Integer> valsTranslated = new ArrayList<>();
            for(String val: serie) {
                Integer itemId = itemsSortedMap.get(String.format("%s=%s", names.get(k), val));
                valsTranslated.add(itemId);
            }
            System.out.println(valsTranslated);
            k++;
        }
        System.out.println("Transaction:");
        System.out.println( transactions.get(i));
        String[] transaction = transactions.get(i).split(" ");
        // Use StringBuilder rather than String concatenation inside the loop.
        StringBuilder translated = new StringBuilder();
        for(String transactionItem: transaction) {
            translated.append(itemToLabel.get(transactionItem)).append(' ');
        }
        System.out.println("Translated back:");
        System.out.println(translated);
    }
}
static Pair<File,File> arffToSPMFSequenceDatabase(File arff, List<String> columnsFiltered) throws IOException {
List<String> names = ArffUtils.getAttributeNames(arff);
List<List<String>> rows = ArffUtils.loadDataMatrix(arff);
System.out.format(String.format("toSequenceDatabase: columns: %s%n", CollectionUtils.join(columnsFiltered)));
//1. make item representation
File dict = new File("./temp/", IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-dict"),"txt").getName());
// count
// generate dictionary by counting
Map<String,Integer> itemsSortedMap = oneHotEncodingItems(names,rows,columnsFiltered,dict);
//2. Group by window
Pair<TreeSet<Integer>, ListMap<Integer,List<String>>> pair = groupByWindow(names, rows);
......@@ -179,26 +107,43 @@ public class ArffToSPMF {
}
transaction.add(itemsetAtTimestampK);
}
if(isSequence) {
// SPMF format:
// 1 -1 1 2 3 -1 1 3 -1 4 -1 3 6 -1 -2
// The first line represents a sequence where the itemset {1} is followed by the itemset {1, 2, 3},
// followed by the itemset {1, 3}, followed by the itemset {4}, followed by the itemset {3, 6}
String s = "";
StringBuilder sb = new StringBuilder();
for(Set<Integer> set: transaction) {
if(set.isEmpty())
continue;
s += CollectionUtils.join(set, " ") + " -1 ";
sb.append(CollectionUtils.join(set, " "));
sb.append(" -1 ");
}
sb.append("-2");
transactions.add(sb.toString());
} else {
Set<Integer> transactionAsList = new TreeSet<>();
for(Set<Integer> set: transaction) {
transactionAsList.addAll(set);
}
s += "-2";
transactions.add(s);
transactions.add(CollectionUtils.join(transactionAsList, " "));
}
}
File transactiondb = new File("./temp",
IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-sequencedb"),"txt").getName());
isSequence ?
IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-sequencedb"),"txt").getName():
IOUtils.getFileWithDifferentExtension(IOUtils.getFileWithDifferentSuffix(arff, "-transactions"),"txt").getName()
);
IOUtils.saveFile(transactions, transactiondb);
return new Pair<>(dict,transactiondb);
return new Triple<>(dict, transactiondb, itemsSortedMap);
}
@SuppressWarnings("unchecked")
public static void saveOutputReadable(String datasetName, File dictFile, File outputSPMF, File outputCSVReadable)
throws IOException {
......
package be.uantwerpen.datamining.pattern_mining;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.tomcat.util.buf.StringUtils;
import be.uantwerpen.ldataminining.model.Pair;
import be.uantwerpen.ldataminining.utils.AlgorithmParameters;
import be.uantwerpen.ldataminining.utils.CollectionUtils;
import be.uantwerpen.ldataminining.utils.CommandLineUtils;
import be.uantwerpen.ldataminining.utils.IOUtils;
import be.uantwerpen.ldataminining.utils.Triple;
import be.uantwerpen.mime_webapp.Settings;
import ca.pfv.spmf.algorithmmanager.AlgorithmManager;
import ca.pfv.spmf.algorithmmanager.DescriptionOfParameter;
......@@ -19,14 +21,8 @@ public final class MineUsingSPMF {
private MineUsingSPMF(){}
public static File runMining(boolean isSequential, File arff, List<String> columns, String algorithm, Map<String, String> parametersMap) throws Exception {
String [] processedParameters = processParameters(algorithm, parametersMap);
Pair<File, File> dictFileTransFilePair;
if(isSequential) {
dictFileTransFilePair = ArffToSPMF.arffToSPMFSequenceDatabase(arff, columns);
} else {
dictFileTransFilePair = ArffToSPMF.arffToSPMFTransactionalDatabase(arff, columns);
}
Triple<File, File, Map<String,Integer>> dictFileTransFilePair = ArffToSPMF.arffToSPMF(isSequential, arff, columns);
String [] processedParameters = processParameters(algorithm, parametersMap, dictFileTransFilePair.getThird());
File log = new File("./temp/pattern_mining_" + arff.getName() + ".log");
File outputRaw = new File("./temp/" + algorithm + "_out_raw_" + arff.getName() + ".txt");
......@@ -39,7 +35,6 @@ public final class MineUsingSPMF {
// Add parameters to the command minsup HAVE to be included
String[] command = ArrayUtils.addAll(commandBase, processedParameters);
System.out.println("\n\n\n\n\n\n\n\n\n\n\n"+isSequential+"\n\n\n\n\n\n\n\n\n\n\n");
CommandLineUtils.runCommandInUserDir(command, log, Settings.SMPF_TIMEOUT);
IOUtils.printHead(log, 100);
String logStr = IOUtils.readFileFlat(log);
......@@ -62,15 +57,23 @@ public final class MineUsingSPMF {
return outputReadable;
}
public static File runItemsetMining(File arff, List<String> columns, String algorithm, Map<String, String> parametersMap) throws Exception {
return runMining(false, arff, columns, algorithm, parametersMap);
private static void preProcessParameters(Map<String, String> parametersMap, Map<String, Integer> idMap) {
if(parametersMap.containsKey(AlgorithmParameters.REQUIRED_ITEMS)) {
String requiredItems = parametersMap.get(AlgorithmParameters.REQUIRED_ITEMS);
String[] items = requiredItems.split(",");
List<String> itemsList = new ArrayList<>();
for(String item : items) {
if(idMap.containsKey(item)) {
itemsList.add(idMap.get(item).toString());
}
}
parametersMap.put(AlgorithmParameters.REQUIRED_ITEMS, CollectionUtils.join(itemsList, ","));
}
public static File runSequentialPatternMining(File arff, List<String> columns, String algorithm, Map<String, String> parametersMap) throws Exception {
return runMining(true, arff, columns, algorithm, parametersMap);
}
private static String[] processParameters(String algorithm, Map<String, String> parametersMap) throws Exception {
private static String[] processParameters(String algorithm, Map<String, String> parametersMap, Map<String, Integer> map) throws Exception {
preProcessParameters(parametersMap, map);
DescriptionOfParameter[] parametersDescriptions = AlgorithmManager.getInstance().getDescriptionOfAlgorithm(algorithm).getParametersDescription();
String[] processedParameters = new String[parametersDescriptions.length];
int parameterCount = 0;
......
......@@ -9,4 +9,8 @@ public class Triple<T1, T2, T3> extends ca.pfv.spmf.algorithms.sequentialpattern
/**
 * Returns a human-readable representation of this triple in the form
 * {@code <first,second,third>}.
 */
public String toString() {
    return "<" + getFirst() + "," + getSecond() + "," + getThirth() + ">";
}
/**
 * Correctly-spelled accessor for the third element; delegates to the
 * inherited (misspelled) {@code getThirth()} of the SPMF base class.
 *
 * @return the third element of this triple
 */
public T3 getThird() {
    return super.getThirth();
}
}
......@@ -71,12 +71,7 @@ public class PatternMiningController extends AbstractController{
File data = currentInput.getFile();
File outputReadable = null;
if(isItemset) {
outputReadable = MineUsingSPMF.runItemsetMining(data, Arrays.asList(columnsArr), algorithm, parameterMap);
} else {
outputReadable = MineUsingSPMF.runSequentialPatternMining(data, Arrays.asList(columnsArr), algorithm, parameterMap);
}
File outputReadable = MineUsingSPMF.runMining(!isItemset, data, Arrays.asList(columnsArr), algorithm, parameterMap);
Project newProject = savePatterns(request, id, data, outputReadable, "sequential patterns", columns, algorithm, parameterMap.get(AlgorithmParameters.MINSUP).replace("%", ""));
final FileItem newInput = repository.getLatestItem(newProject, currentInput.getLogicalName());
......
0% Loading or reload the page.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment