Ejemplo n.º 1
0
        /// <summary>
        /// Entry point: converts a semicolon-separated data file into a CSV file for Weka.
        /// The pipeline is: load the raw file, parse it into <c>DataLine</c> objects,
        /// run <c>DiscretizeValues</c>, <c>AddSpielNominee</c> and <c>DivideLists</c> on them,
        /// and write the result with <c>WriteToFile</c>.
        /// </summary>
        /// <param name="args">
        /// Optional command-line arguments. <c>args[0]</c>, when present, is the input file;
        /// otherwise "data_w_right_ratings2014-05-02.csv" is used.
        /// </param>
        /// <remarks>
        /// The output file is the input path with its extension replaced by "-weka.csv".
        /// The method waits for a line on standard input before returning.
        /// </remarks>
        static void Main(string[] args)
        {
            //DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");

            string file = args.Length > 0 ? args[0] : "data_w_right_ratings2014-05-02.csv";

            // Strip only the real file extension. Splitting on the first '.' would truncate
            // paths such as "../data/games.csv" or names such as "games.v2.csv".
            string outputfile = System.IO.Path.ChangeExtension(file, null) + "-weka.csv";

            Console.WriteLine("* Loading CSV-file (" + file + ")...");
            string[][] rawData = CSVParser.ReadDataFile(file, ";", "?");

            Console.WriteLine("* Parsing data...");
            List<DataLine> data = DataLine.ParseFixed(rawData);

            Console.WriteLine("* Discretize numeric values");
            DiscretizeValues(data);

            Console.WriteLine("* Adding extra parameters");
            AddSpielNominee(data);

            Console.WriteLine("* Expanding arrays to boolean parameters...");
            List<DataLine> wekaData = DivideLists(data);

            Console.WriteLine("* Writing games to Weka CSV-file (" + outputfile + ")...");
            WriteToFile(wekaData, outputfile);

            Console.WriteLine();
            Console.WriteLine("DONE");

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Prints, for every field of the parsed data set, how many lines lack a value and
        /// which percentage of all lines that is. Each result is written to the console as
        /// one LaTeX table row (<c>label &amp; count &amp; percent% \\ \hline</c>).
        /// </summary>
        /// <remarks>
        /// A field in <c>hashDoubles</c> is counted when its value equals 0; a field in
        /// <c>hashStringArrays</c> or <c>hashStrings</c> is counted when it is null or has
        /// length 0. The field names are taken from the first parsed line. If no lines were
        /// parsed, a short message is printed instead of a table.
        /// </remarks>
        public static void MissingValues()
        {
            string[][] data = CSVParser.ReadDataFile("data_w_right_ratings2014-05-02.csv", ";", null);
            Console.WriteLine("Read datalines");

            DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
            Console.WriteLine("Read link file");

            List<DataLine> answers = DataLine.ParseFixed(data);

            // The field names come from the first line, so without one there is nothing
            // to report (and indexing answers[0] would throw).
            if (answers == null || answers.Count == 0)
            {
                Console.WriteLine("No data lines parsed; nothing to report.");
                return;
            }

            DataLine first = answers[0];
            int      total = answers.Count;

            foreach (string label in first.hashDoubles.Keys)
            {
                int missing = answers.Count(line => line.hashDoubles[label] == 0);
                PrintMissingValueRow(label, missing, total);
            }

            foreach (string label in first.hashStringArrays.Keys)
            {
                int missing = answers.Count(line => line.hashStringArrays[label] == null || line.hashStringArrays[label].Length == 0);
                PrintMissingValueRow(label, missing, total);
            }

            foreach (string label in first.hashStrings.Keys)
            {
                int missing = answers.Count(line => line.hashStrings[label] == null || line.hashStrings[label].Length == 0);
                PrintMissingValueRow(label, missing, total);
            }
        }

        /// <summary>
        /// Writes a blank line followed by one LaTeX table row with the label, the number of
        /// missing values and that number as a percentage of <paramref name="total"/>
        /// (rounded to two decimals).
        /// </summary>
        private static void PrintMissingValueRow(string label, int missing, int total)
        {
            double percent = Math.Round(missing / (double)total * 100.0, 2);

            Console.WriteLine();
            Console.WriteLine(label + "\t& " + missing + "\t& " + percent + "% \\\\ \\hline");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads the data set from the given file and the historical data set, then runs
        /// <c>DataMining.BackPropagation</c> on the historical data.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Read <paramref name="file"/> as a 2-dim string array (";" separated).
        /// 2. Load the link dictionary from "linkIdNames.txt" into <c>DataLine.linkDictionary</c>.
        /// 3. Parse the rows with <c>DataLine.ParseFixed</c> and keep at most the first 20000.
        /// 4. Read and parse "data2014-04-09_09-11-52-historical.csv" with <c>DataLine.ParseHistorical</c>.
        /// 5. Run back-propagation on the historical lines.
        /// Min-max normalization, kNN classification and k-means clustering are NOT performed
        /// by this method, and no output file is written; progress goes to the console only.
        /// </remarks>
        /// <param name="file">The input file name, passed unchanged to
        /// <c>CSVParser.ReadDataFile</c>.</param>
        private static void PerformDM(string file)
        {
            Console.WriteLine("Parsing started.");

            string[][] data = CSVParser.ReadDataFile(file, ";", null);
            Console.WriteLine("Read datalines");

            DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
            Console.WriteLine("Read link file");

            // NOTE(review): 'answers' is not consumed by any later step in this method.
            // The parse is kept because ParseFixed may have side effects that are not
            // visible from here; confirm before removing.
            List<DataLine> answers = DataLine.ParseFixed(data);

            answers = answers.Take(20000).ToList();

            List<DataLine> historical = DataLine.ParseHistorical(CSVParser.ReadDataFile("data2014-04-09_09-11-52-historical.csv", ";", null)).ToList();

            Console.WriteLine("Historical Loaded");

            Console.WriteLine("Parsing Complete.\n");

            DataMining.BackPropagation(historical);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Loads and parses "data_w_right_ratings2014-05-02.csv" and prints box-plot results
        /// for six numeric fields as LaTeX table rows.
        /// </summary>
        /// <remarks>
        /// Every field is passed to <c>BoxPlot</c> twice: first with the bounds
        /// <c>int.MinValue</c>..<c>int.MaxValue</c>, then with the field-specific bounds listed
        /// in the table below. Each call produces one row built from the three items of the
        /// returned tuple.
        /// </remarks>
        public static void BoxplotMe()
        {
            string[][] rawRows = CSVParser.ReadDataFile("data_w_right_ratings2014-05-02.csv", ";", null);
            Console.WriteLine("Read datalines");

            DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
            Console.WriteLine("Read link file");

            List<DataLine> parsed = DataLine.ParseFixed(rawRows);

            // Field name plus the lower/upper bound used for its second, restricted run.
            var fields = new[]
            {
                Tuple.Create("min_age",        1,    100),
                Tuple.Create("playingtime",    1,    1000),
                Tuple.Create("min_players",    1,    20),
                Tuple.Create("max_players",    1,    100),
                Tuple.Create("year_published", 1900, int.MaxValue),
                Tuple.Create("average_rating", 0,    20),
            };

            foreach (var field in fields)
            {
                string name = field.Item1;

                // First the unrestricted run, then the restricted one.
                var ranges = new[]
                {
                    Tuple.Create(int.MinValue, int.MaxValue),
                    Tuple.Create(field.Item2, field.Item3),
                };

                foreach (var range in ranges)
                {
                    var plot = BoxPlot(parsed, name, range.Item1, range.Item2);

                    Console.WriteLine();
                    Console.WriteLine(name + " \\newline\n(" + plot.Item2 + " and " + plot.Item3 + " removed)\t\t\t& " + plot.Item1 + " \\\\ \\hline");
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs frequent-pattern mining (Apriori) and association-rule mining on
        /// "data2014-04-03_03-35-14.csv" and prints the results to the console.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Load and parse the data, keeping at most 100000 lines.
        /// 2. Print every line whose "min_age" exceeds 90 or whose "playingtime" exceeds 1000.
        /// 3. Run <c>DataMining.Apriori</c> with a support threshold of 5% of the line count and
        ///    print the patterns in ascending order of support.
        /// 4. Run <c>DataMining.AprioriAssociationRules</c> with confidence 0.5 and print the
        ///    rules in ascending order of their fourth tuple item (printed as "lift").
        /// 5. Print the elapsed time in milliseconds; the measurement covers steps 3 and 4
        ///    including their console output.
        /// </remarks>
        public static void FrequentPatternAnalysis()
        {
            double support    = .05;
            double confidence = .5;
            int    nbElements = 100000;

            string[][] data = CSVParser.ReadDataFile("data2014-04-03_03-35-14.csv", ";", null);
            Console.WriteLine("Read datalines");

            DataLine.linkDictionary = CSVParser.ReadLinkFile("linkIdNames.txt");
            Console.WriteLine("Read link file");

            List<DataLine> answers = DataLine.ParseFixed(data);

            answers = answers.Take(nbElements).ToList();

            // Report lines with implausibly large values so they can be inspected by hand.
            foreach (var dl in answers)
            {
                var min_age     = dl.hashDoubles["min_age"];
                var playingtime = dl.hashDoubles["playingtime"];

                if (min_age > 90)
                {
                    Console.WriteLine(dl.hashDoubles["id"] + ": min_age: " + min_age);
                }
                if (playingtime > 1000)
                {
                    Console.WriteLine(dl.hashDoubles["id"] + ": playingtime: " + playingtime);
                }
            }

            var stopwatch = Stopwatch.StartNew();

            // Apriori
            var aprioriLabels    = new string[] { "mechanics", "categories", "min_players", "max_players", "playingtime", "average_rating" };
            int supportThreshold = (int)(answers.Count * support);

            Console.WriteLine("Apriori with suppport: " + support);
            Console.WriteLine("Datalines: " + answers.Count);
            var patterns = DataMining.Apriori(answers, supportThreshold, aprioriLabels);

            // Ascending by support count. CompareTo avoids the subtraction idiom, which is
            // only safe while both operands are known to be non-negative.
            patterns.Sort((left, right) => left.Item2.CompareTo(right.Item2));
            foreach (Tuple<List<string>, int> pattern in patterns)
            {
                Console.WriteLine("Support: " + pattern.Item2 + " / " + Math.Round((100d * pattern.Item2) / answers.Count, 1) + "%: [" + string.Join(",", pattern.Item1.Select(DataLine.IDtoLabel)) + "]");
            }

            Console.WriteLine("Now doing association mining with confidence: " + confidence);

            // Association rules
            var rules = DataMining.AprioriAssociationRules(answers, patterns, confidence);

            // Ascending by lift. CompareTo is used instead of Math.Sign(a - b) because
            // Math.Sign throws ArithmeticException when the difference is NaN.
            rules.Sort((left, right) => left.Item4.CompareTo(right.Item4));

            foreach (var rule in rules)
            {
                Console.WriteLine("Conf=" + Math.Round(rule.Item3 * 100, 1) + "% lift=" + Math.Round(rule.Item4, 2) + ": [" + string.Join(",", rule.Item1.Select(DataLine.IDtoLabel)) + "] => \t[" + string.Join(",", rule.Item2.Select(DataLine.IDtoLabel)) + "]");
            }
            Console.WriteLine("Done with frequent pattern analysis!");
            stopwatch.Stop();
            Console.WriteLine("Time: " + stopwatch.ElapsedMilliseconds);
        }