Exemplo n.º 1
0
        /// <summary>
        /// Similar to <see cref="BuildTransitionFrequencyCounts"/>, but pools all students from all
        /// loaded files.  Transition sequences of every length in the hard-coded range are counted,
        /// sequences seen fewer than five times are dropped, closely aligned sequences are unioned
        /// in a disjoint set, and the per-sequence counts are written to "popular_sequences.csv"
        /// (relative to the current directory), most frequent first.
        /// </summary>
        /// <param name="vm">View model used to load the timelines and to compute the transition counts.</param>
        private void AggregateTransitionFrequencyCounts(TimelineAnalysisViewModel vm)
        {
            //sequence boundaries and thresholds.  Hard coded for now
            const int startingSequenceLength = 2;
            const int endingSequenceLength = 25;

            //sequences seen fewer times than this are discarded
            const int minimumOccurrences = 5;

            //two sequences are unioned when their alignment score is below this value
            const double alignmentScoreCutoff = 3;

            //state numbers are shifted by this amount so that state 0 becomes 'a', 1 becomes 'b', etc.
            const int startingNumber = (int)'a';

            //step 1: get list of files to process
            List<string> filesToProcess = new List<string>();
            string fileName;

            Console.WriteLine("Enter files to process (-1 to stop)");
            while ((fileName = GetFile()).Length > 0)
            {
                filesToProcess.Add(fileName);
            }

            //without at least one file there is nothing to load; bail out instead of
            //failing on filesToProcess[0]
            if (filesToProcess.Count == 0)
            {
                Console.WriteLine("No files to process.");
                return;
            }

            //load all data into VM: the first file starts the timeline, the rest are appended
            vm.LoadTimeline(filesToProcess[0]);
            for (int i = 1; i < filesToProcess.Count; i++)
            {
                vm.AppendTimeline(filesToProcess[i]);
            }

            //NOTE(review): this directory is created but nothing below writes into it;
            //the result file goes to the current directory.  Confirm which one is intended.
            //CreateDirectory does nothing when the directory already exists.
            string outputDirectory = "AggregateTransitionFrequencyCounts";
            Directory.CreateDirectory(outputDirectory);

            /*
             * What I need to do:
             * Get all sequences.
             * For each sequence:
             *      Determine if similar to other known sequences.  If so, combine into same set. (disjoint set?)
             * */

            //step 2: collect the transition counts, keyed by sequence length
            Dictionary<int, Dictionary<string, int>> allTransitions = new Dictionary<int, Dictionary<string, int>>();
            for (int sequenceLength = startingSequenceLength; sequenceLength <= endingSequenceLength; sequenceLength++)
            {
                //get grade data
                vm.AttachGrades();

                //build markov transitions
                vm.BuildDefaultMarkovStates();

                //figure out sequence distribution for the entire data set
                Dictionary<string, int> transitions = vm.GetAllTransitionCombinations(sequenceLength);

                //filter out rare sequences.  Keys are snapshotted first because the dictionary
                //cannot be modified while it is being enumerated
                List<string> rareKeys = transitions.Where(t => t.Value < minimumOccurrences)
                                                   .Select(t => t.Key)
                                                   .ToList();
                foreach (string rareKey in rareKeys)
                {
                    transitions.Remove(rareKey);
                }

                //save for future use
                allTransitions.Add(sequenceLength, transitions);

                Console.WriteLine("Loaded transitions of length {0}.", sequenceLength);
            }

            //step 3: use Needleman-Wunsch algorithm and disjoint sets to combine similar sequences
            DisjointSet<string> matches = new DisjointSet<string>();
            int matchCount = 0;

            //start with large sequences as it will make it more likely that these will be the "top" of the disjoint set
            for (int sequenceLength = endingSequenceLength; sequenceLength >= startingSequenceLength; sequenceLength--)
            {
                Console.WriteLine("Matching sequences of length {0}", sequenceLength);

                //Needleman-Wunsch works on single characters, so we need to transform Markov-like numbers to letters
                Dictionary<string, int> originalSequences = allTransitions[sequenceLength];
                Dictionary<string, int> modifiedSequences = new Dictionary<string, int>();
                foreach (KeyValuePair<string, int> kvp in originalSequences)
                {
                    //"3_0_1" -> { 'd', 'a', 'b' } -> "d_a_b"
                    char[] sequence = kvp.Key.Split('_')
                                             .Select(k => Convert.ToChar(Convert.ToInt32(k) + startingNumber))
                                             .ToArray();
                    string charSequence = string.Join("_", sequence);

                    //lastly, remember this sequence
                    modifiedSequences.Add(charSequence, kvp.Value);
                }

                //prime the disjoint set
                foreach (string key in modifiedSequences.Keys)
                {
                    matches.Find(key);
                }

                //having converted to character state representations, now compare every pair once
                List<string> sequences = modifiedSequences.Keys.ToList();
                for (int i = 0; i < sequences.Count; i++)
                {
                    for (int j = i + 1; j < sequences.Count; j++)
                    {
                        string firstSequence = sequences[i];
                        string secondSequence = sequences[j];

                        //current representatives; looked up on every pass because earlier unions may change them
                        string first = matches.Find(firstSequence);
                        string second = matches.Find(secondSequence);

                        //automatically count sequences as the same when one sequence consists
                        //entirely of repetitions of the other
                        //NOTE(review): presumably all sequences in one pass have equal length, in
                        //which case this can only hold for identical strings -- confirm
                        bool oneContainsOther = firstSequence.Replace(secondSequence, "").Length == 0 ||
                                                secondSequence.Replace(firstSequence, "").Length == 0;
                        if (oneContainsOther)
                        {
                            matches.UnionWith(first, second);
                            matchCount++;
                            continue;
                        }

                        //otherwise align the two set representatives with NW
                        var result = NeedlemanWunsch.Align(first, second);

                        //if score is similar, then count the sequences as the same (union)
                        if ((double)NeedlemanWunsch.ScoreNpsmSequence(result.Item1, result.Item2) < alignmentScoreCutoff)
                        {
                            matches.UnionWith(first, second);
                            matchCount++;
                        }
                    }
                }
            }

            //step 4: now, get all sets and figure out popularity of their members
            Console.WriteLine("{0} unions performed.", matchCount);
            List<List<string>> allSets = matches.AllSets();
            Dictionary<string, int> popularityDict = new Dictionary<string, int>();

            Console.WriteLine("Calculating popularity of {0} sets...", allSets.Count);
            foreach (List<string> set in allSets)
            {
                foreach (string item in set)
                {
                    //convert back to Markov-style transitions ("d_a_b" -> "3_0_1")
                    int[] pieces = item.Split('_')
                                       .Select(c => (int)Convert.ToChar(c) - startingNumber)
                                       .ToArray();
                    string key = string.Join("_", pieces);

                    //add in counts to the popularity dictionary; a missing key starts from 0
                    //NOTE(review): counts are stored under each sequence's own key, so they
                    //are not summed across the members of a set -- confirm this is intended
                    int runningTotal;
                    popularityDict.TryGetValue(key, out runningTotal);
                    popularityDict[key] = runningTotal + allTransitions[pieces.Length][key];
                }
            }

            //step 5: write this information to a file
            CsvWriter writer = new CsvWriter();

            //aggregate class results
            Console.WriteLine("Writing most popular sequences to file.");
            foreach (KeyValuePair<string, int> kvp in popularityDict.OrderByDescending(p => p.Value))
            {
                //translate each state number into its NPSM name for the report
                int[] pieces = kvp.Key.Split('_').Select(c => Convert.ToInt32(c)).ToArray();
                string npsmKey = string.Join("_", pieces.Select(p => vm.StateNumberToNpsmString(p)).ToArray());

                writer.AddToCurrentLine(npsmKey);
                writer.AddToCurrentLine(kvp.Value.ToString());
                writer.CreateNewRow();
            }
            using (TextWriter tw = File.CreateText("popular_sequences.csv"))
            {
                tw.Write(writer.ToString());
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// For each entered file (one file per assignment, in assignment order), each sequence length
        /// in the hard-coded range and each grade band (A-F), writes two CSV files below
        /// "TransitionFrequencyCounts/&lt;file name without '#'&gt;":
        /// "aggregate_&lt;length&gt;_&lt;grade&gt;.csv" with the transition counts for the whole band, and
        /// "individual_&lt;length&gt;_&lt;grade&gt;.csv" with a raw and a normalized row per student.
        /// </summary>
        /// <remarks>
        /// What I want to do:
        ///  For each assignment:
        ///      figure out common sequences of length m to n
        ///      For each student, for each grade band (A-F), again determine frequencies of length m to n
        ///      Build a frequency distribution for each grade band by sequence
        /// </remarks>
        /// <param name="vm">View model used to attach grades, filter by grade and compute the transition counts.</param>
        private void BuildTransitionFrequencyCounts(TimelineAnalysisViewModel vm)
        {
            //step 1: get list of files to process
            List<string> filesToProcess = new List<string>();
            string fileName;

            Console.WriteLine("Enter files to process (-1 to stop)");
            while ((fileName = GetFile()).Length > 0)
            {
                filesToProcess.Add(fileName);
            }

            //step 2: setup grade-bands (e.g. A, B, C, etc.)  Hard coded for now as this is just for a
            //single class.  gradeRanges[i] is the lower bound of the band named gradeMap[i]
            const double topScore = 200;
            double[] gradeRanges = { 90, 78, 69, 60, 0 };
            string[] gradeMap = { "A", "B", "C", "D", "F" };

            //step 3: get sequence boundaries.  Again, hard coded for now
            const int startingSequenceLength = 2;
            const int endingSequenceLength = 25;

            //interesting transitions are those in which we have at least this many occurrences
            const int minimumOccurrences = 5;

            //step 4: get assignments.  The n-th entered file is matched with the n-th assignment
            string[] assignments = { "Assignment #1",
                                     "Assignment #2",
                                     "Assignment #3",
                                     "Assignment #4",
                                     "Assignment #5",
                                     "Assignment #6",
                                     "Assignment #7" };
            int assignmentCounter = 0;

            //this produces a lot of files, so create a separate directory for the output.
            //CreateDirectory does nothing when the directory already exists
            string outputDirectory = "TransitionFrequencyCounts";
            Directory.CreateDirectory(outputDirectory);

            //finally, begin processing
            foreach (string fileToProcess in filesToProcess)
            {
                //more files than known assignments: stop here rather than index past the end
                //of the assignment list
                if (assignmentCounter >= assignments.Length)
                {
                    Console.WriteLine("No assignment is configured for file {0}; skipping it and any remaining files.", fileToProcess);
                    break;
                }
                string assignment = assignments[assignmentCounter];

                string folderName = fileToProcess.Replace("#", "");
                string outputPath = Path.Combine(outputDirectory, folderName);
                Directory.CreateDirectory(outputPath);

                for (int sequenceLength = startingSequenceLength; sequenceLength <= endingSequenceLength; sequenceLength++)
                {
                    //reset max score for A students
                    double maxScore = topScore;

                    //based on currently existing code, it is easier to reopen the file for
                    //each grade range
                    for (int i = 0; i < gradeRanges.Length; i++)
                    {
                        double bound = gradeRanges[i];
                        string grade = gradeMap[i];

                        //reload the file
                        //NOTE(review): presumably LoadFile populates the same view model as vm -- confirm
                        LoadFile(fileToProcess);

                        //get grade data
                        vm.AttachGrades();

                        //filter based on grade data
                        vm.FilterByGrade(assignment, bound, maxScore);

                        //the next (lower) band ends just below this band's lower bound
                        maxScore = bound - 0.01;

                        //build markov transitions
                        vm.BuildDefaultMarkovStates();

                        //figure out sequence distribution for entire data set and for individual students
                        Dictionary<string, int> transitions = vm.GetAllTransitionCombinations(sequenceLength);

                        //keep transitions with at least minimumOccurrences occurrences, least frequent first.
                        //The comparison is inclusive so that a count of exactly 5 is kept, as "at least" implies
                        List<KeyValuePair<string, int>> interestingTransitions = transitions
                            .Where(t => t.Value >= minimumOccurrences)
                            .OrderBy(t => t.Value)
                            .ToList();

                        //aggregate class results: one "transition, count" row per interesting transition
                        Console.WriteLine("Processing transition sequences of length {0}...", sequenceLength);
                        CsvWriter writer = new CsvWriter();
                        foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                        {
                            writer.AddToCurrentLine(kvp.Key);
                            writer.AddToCurrentLine(kvp.Value.ToString());
                            writer.CreateNewRow();
                        }
                        string aggregateFile = Path.Combine(outputPath, string.Format("aggregate_{0}_{1}.csv", sequenceLength, grade));
                        using (TextWriter tw = File.CreateText(aggregateFile))
                        {
                            tw.Write(writer.ToString());
                        }

                        //individual students
                        //add header data: UserId, Grade, then one column per interesting transition
                        writer = new CsvWriter();
                        writer.AddToCurrentLine("UserId");
                        writer.AddToCurrentLine("Grade");
                        foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                        {
                            writer.AddToCurrentLine(kvp.Key);
                        }
                        writer.CreateNewRow();

                        foreach (var user in vm.Timeline.Values)
                        {
                            //first row for users is raw values
                            writer.AddToCurrentLine(user.OsbideId);
                            writer.AddToCurrentLine(grade);

                            //only use the interesting states as columns as identified in the aggregate analysis
                            foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                            {
                                if (user.TransitionCounts.ContainsKey(kvp.Key))
                                {
                                    writer.AddToCurrentLine(user.TransitionCounts[kvp.Key]);
                                }
                                else
                                {
                                    writer.AddToCurrentLine("0");
                                }
                            }
                            writer.CreateNewRow();

                            //2nd row contains normalized values: each count divided by the
                            //student's total over all of their transitions
                            writer.AddToCurrentLine(user.OsbideId);
                            writer.AddToCurrentLine(grade);
                            int totalTransitions = user.TransitionCounts.Values.Sum();

                            //only use the interesting states as columns as identified in the aggregate analysis
                            foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                            {
                                if (user.TransitionCounts.ContainsKey(kvp.Key))
                                {
                                    writer.AddToCurrentLine(user.TransitionCounts[kvp.Key] / (double)totalTransitions);
                                }
                                else
                                {
                                    writer.AddToCurrentLine("0");
                                }
                            }
                            writer.CreateNewRow();
                        }
                        string individualFile = Path.Combine(outputPath, string.Format("individual_{0}_{1}.csv", sequenceLength, grade));
                        using (TextWriter tw = File.CreateText(individualFile))
                        {
                            tw.Write(writer.ToString());
                        }
                    }
                }

                //move to the next assignment
                assignmentCounter++;
            }
        }