/// <summary>
/// Program entry point: loads a CSV data file, parses it into DataLine objects,
/// discretizes the numeric values, adds the extra "Spiel nominee" parameter, expands
/// the array parameters into boolean parameters and writes the result to a
/// "-weka.csv" file next to the input file.
/// </summary>
/// <param name="args">
/// Optional command line arguments. When present, <c>args[0]</c> is the input file;
/// otherwise "data_w_right_ratings2014-05-02.csv" is used.
/// </param>
static void Main(string[] args)
{
    // Loading of the link dictionary is currently disabled:
    //DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
    String file = args.Length > 0 ? args[0] : "data_w_right_ratings2014-05-02.csv";

    // Build the output name from the file name only. Splitting the whole path on the
    // first '.' breaks for inputs such as "./data.csv" (yields "-weka.csv") or
    // "my.data.csv" (yields "my-weka.csv").
    String directory = System.IO.Path.GetDirectoryName(file) ?? String.Empty;
    String outputfile = System.IO.Path.Combine(
        directory,
        System.IO.Path.GetFileNameWithoutExtension(file) + "-weka.csv");

    Console.WriteLine("* Loading CSV-file (" + file + ")...");
    // ";" and "?" are presumably the field separator and the missing-value marker
    // (TODO confirm against CSVParser.ReadDataFile).
    String[][] rawData = CSVParser.ReadDataFile(file, ";", "?");

    Console.WriteLine("* Parsing data...");
    List<DataLine> data = DataLine.ParseFixed(rawData);

    Console.WriteLine("* Discretize numeric values");
    DiscretizeValues(data);

    Console.WriteLine("* Adding extra parameters");
    AddSpielNominee(data);

    Console.WriteLine("* Expanding arrays to boolean parameters...");
    List<DataLine> wekaData = DivideLists(data);

    Console.WriteLine("* Writing games to Weka CSV-file (" + outputfile + ")...");
    WriteToFile(wekaData, outputfile);

    Console.WriteLine();
    Console.WriteLine("DONE");

    // Blocks until a line is read from standard input before the program exits.
    Console.ReadLine();
}
/// <summary>
/// Reads "data_w_right_ratings2014-05-02.csv" and prints, for every attribute of the
/// parsed data lines, how many lines have no usable value for it. Each attribute is
/// printed as one LaTeX table row: label, absolute count and percentage.
/// A value counts as missing when
/// - a numeric attribute (hashDoubles) equals 0,
/// - a string-array attribute (hashStringArrays) is null or has length 0,
/// - a string attribute (hashStrings) is null or has length 0.
/// The attribute labels are taken from the first data line.
/// </summary>
public static void MissingValues()
{
    string[][] data = CSVParser.ReadDataFile("data_w_right_ratings2014-05-02.csv", ";", null);
    Console.WriteLine("Read datalines");
    DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
    Console.WriteLine("Read link file");
    List<DataLine> answers = DataLine.ParseFixed(data);

    // The labels come from the first line, so an empty data set has nothing to report
    // (and indexing answers[0] would throw).
    if (answers == null || answers.Count == 0)
    {
        Console.WriteLine("No data lines to analyse.");
        return;
    }

    int total = answers.Count;
    DataLine first = answers[0];

    foreach (string label in first.hashDoubles.Keys)
    {
        int missing = answers.Count(line => line.hashDoubles[label] == 0);
        PrintMissingValueRow(label, missing, total);
    }

    foreach (string label in first.hashStringArrays.Keys)
    {
        int missing = answers.Count(line => line.hashStringArrays[label] == null
                                            || line.hashStringArrays[label].Length == 0);
        PrintMissingValueRow(label, missing, total);
    }

    foreach (string label in first.hashStrings.Keys)
    {
        int missing = answers.Count(line => line.hashStrings[label] == null
                                            || line.hashStrings[label].Length == 0);
        PrintMissingValueRow(label, missing, total);
    }
}

/// <summary>
/// Prints a blank line followed by one LaTeX table row with the label, the number of
/// missing values and that number as a percentage of <paramref name="total"/>
/// (rounded to two decimals).
/// </summary>
private static void PrintMissingValueRow(string label, int missing, int total)
{
    double percent = Math.Round(missing / (double)total * 100.0, 2);
    Console.WriteLine();
    Console.WriteLine(label + "\t& " + missing + "\t& " + percent + "% \\\\ \\hline");
}
/// <summary>
/// Loads the data located in the given file together with the link dictionary and the
/// historical data file, then runs back-propagation on the historical data.
/// The actions are as follows:
/// 1. Load the input as a 2-dim string array and parse it into DataLine objects
///    (only the first 20000 lines are kept).
/// 2. Load and parse "data2014-04-09_09-11-52-historical.csv".
/// 3. Call DataMining.BackPropagation with the historical lines.
/// The min-max normalization, kNN classification and kMeans clustering experiments
/// are disabled; they are kept as comments at the end of the method.
/// </summary>
/// <param name="file">The name of the input file; it is handed unchanged to
/// CSVParser.ReadDataFile.</param>
private static void PerformDM(string file)
{
    Console.WriteLine("Parsing started.");

    string[][] rawRows = CSVParser.ReadDataFile(file, ";", null);
    Console.WriteLine("Read datalines");

    DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
    Console.WriteLine("Read link file");

    // Only used by the disabled experiments below, but still parsed as before.
    List<DataLine> answers = DataLine.ParseFixed(rawRows).Take(20000).ToList();

    string[][] rawHistorical = CSVParser.ReadDataFile("data2014-04-09_09-11-52-historical.csv", ";", null);
    List<DataLine> historical = DataLine.ParseHistorical(rawHistorical).ToList();
    Console.WriteLine("Historical Loaded");

    Console.WriteLine("Parsing Complete.\n");

    // Output target for the disabled experiments below.
    TextWriter output = Console.Out;

    DataMining.BackPropagation(historical);

    // ---- Disabled experiments ----
    //DataMining.minMaxNormalize(answers);
    //
    // for (int i = 0; i < answers.Count; i++)
    // {
    //     Print(output,answers[i]);
    // }
    //
    // int correct = 0;
    // for (int i = 0; i < answers.Count; i++)
    // {
    //     string key = answers[i].hashStrings.Keys.First();
    //     string guessed = DataMining.kNN(answers.Where(a => !a.Equals(answers[i])).ToList(), answers[i], key, 3);
    //     Print(output,"os: real=" + answers[i].hashStrings[key] + " kNN: " + guessed);
    //     if (answers[i].hashStrings[key] != null && (answers[i].hashStrings[key].Contains(guessed) || guessed.Contains(answers[i].hashStrings[key])))
    //         correct++;
    // }
    // Print(output,"= " + correct + "/" + answers.Count + " guessed right.");
    //
    // // KMeans
    // List<KMeanCluster> clusters = DataMining.KMeansPartition(3, answers);
    // Print(output,"\nkMeans clustering: ");
    // for (int c = 0; c < clusters.Count; c++)
    // {
    //     Print(output,"Cluster #" + c);
    //     Print(output,clusters[c] + "\n");
    // }
}
/// <summary>
/// Reads "data_w_right_ratings2014-05-02.csv" and prints box plot rows (LaTeX table
/// format) for a fixed set of numeric attributes. Every attribute is reported twice:
/// first with the bounds int.MinValue..int.MaxValue, then with an attribute-specific
/// pair of bounds.
/// </summary>
public static void BoxplotMe()
{
    string[][] rawRows = CSVParser.ReadDataFile("data_w_right_ratings2014-05-02.csv", ";", null);
    Console.WriteLine("Read datalines");
    DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
    Console.WriteLine("Read link file");
    List<DataLine> answers = DataLine.ParseFixed(rawRows);

    // Runs BoxPlot for one attribute/bounds combination and prints the resulting row.
    // Item2 and Item3 of the result are printed as the "removed" counts and Item1 as
    // the cell content.
    Action<string, int, int> printRow = (attribute, lower, upper) =>
    {
        var result = BoxPlot(answers, attribute, lower, upper);
        Console.WriteLine();
        Console.WriteLine(attribute + " \\newline\n(" + result.Item2 + " and " + result.Item3
                          + " removed)\t\t\t& " + result.Item1 + " \\\\ \\hline");
    };

    // Attribute label plus the bounds used for its second (restricted) row.
    var restrictedRanges = new[]
    {
        Tuple.Create("min_age", 1, 100),
        Tuple.Create("playingtime", 1, 1000),
        Tuple.Create("min_players", 1, 20),
        Tuple.Create("max_players", 1, 100),
        Tuple.Create("year_published", 1900, int.MaxValue),
        Tuple.Create("average_rating", 0, 20)
    };

    foreach (var range in restrictedRanges)
    {
        printRow(range.Item1, int.MinValue, int.MaxValue);
        printRow(range.Item1, range.Item2, range.Item3);
    }
}
/// <summary>
/// Runs frequent pattern mining on "data2014-04-03_03-35-14.csv":
/// 1. Loads and parses the data (at most 100000 lines are used).
/// 2. Prints every line whose min_age is above 90 or whose playingtime is above 1000.
/// 3. Runs DataMining.Apriori with a support of 5% on a fixed set of labels and
///    prints the patterns ordered by ascending support count.
/// 4. Runs DataMining.AprioriAssociationRules with a confidence of 50% and prints the
///    rules ordered by ascending lift (Item4 of each rule).
/// 5. Prints the elapsed time in milliseconds for steps 3 and 4.
/// </summary>
public static void FrequentPatternAnalysis()
{
    double support = .05;
    double confidence = .5;
    int nbElements = 100000;

    string[][] data = CSVParser.ReadDataFile("data2014-04-03_03-35-14.csv", ";", null);
    Console.WriteLine("Read datalines");
    DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
    Console.WriteLine("Read link file");
    List<DataLine> answers = DataLine.ParseFixed(data);
    answers = answers.Take(nbElements).ToList();

    // Report suspicious values. A missing (null) value never satisfies the
    // comparison, so only present values that exceed the limit are printed.
    foreach (var dl in answers)
    {
        var minAge = dl.hashDoubles["min_age"];
        var playingTime = dl.hashDoubles["playingtime"];

        if (minAge > 90)
        {
            Console.WriteLine(dl.hashDoubles["id"] + ": min_age: " + minAge);
        }
        if (playingTime > 1000)
        {
            Console.WriteLine(dl.hashDoubles["id"] + ": playingtime: " + playingTime);
        }
    }

    var stopwatch = Stopwatch.StartNew();

    // Apriori
    var aprioriLabels = new string[] { "mechanics", "categories", "min_players", "max_players", "playingtime", "average_rating" };
    int supportThreshold = (int)(answers.Count * support);
    Console.WriteLine("Apriori with suppport: " + support);
    Console.WriteLine("Datalines: " + answers.Count);
    var patterns = DataMining.Apriori(answers, supportThreshold, aprioriLabels);

    // CompareTo instead of subtraction: same ordering, but cannot overflow.
    patterns.Sort((left, right) => left.Item2.CompareTo(right.Item2));
    foreach (Tuple<List<string>, int> pattern in patterns)
    {
        Console.WriteLine("Support: " + pattern.Item2 + " / "
                          + Math.Round((100d * pattern.Item2) / answers.Count, 1) + "%: ["
                          + string.Join(",", pattern.Item1.Select(DataLine.IDtoLabel)) + "]");
    }

    Console.WriteLine("Now doing association mining with confidence: " + confidence);

    // Association rules
    var rules = DataMining.AprioriAssociationRules(answers, patterns, confidence);

    // CompareTo instead of Math.Sign(a - b): Math.Sign throws an ArithmeticException
    // when the difference is NaN (e.g. two infinite lift values).
    rules.Sort((left, right) => left.Item4.CompareTo(right.Item4));
    foreach (var rule in rules)
    {
        Console.WriteLine("Conf=" + Math.Round(rule.Item3 * 100, 1) + "% lift=" + Math.Round(rule.Item4, 2)
                          + ": [" + string.Join(",", rule.Item1.Select(DataLine.IDtoLabel))
                          + "] => \t[" + string.Join(",", rule.Item2.Select(DataLine.IDtoLabel)) + "]");
    }

    Console.WriteLine("Done with frequent pattern analysis!");
    stopwatch.Stop();
    Console.WriteLine("Time: " + stopwatch.ElapsedMilliseconds);
}