/// <summary>
/// Similar to <see cref="BuildTransitionFrequencyCounts"/>, but pools all students from all
/// entered files, groups similar transition sequences with a disjoint set, and writes a
/// single ranked list of sequences to "popular_sequences.csv".
/// </summary>
/// <param name="vm">View model that the entered files are loaded into and that supplies the
/// transition counts.</param>
private void AggregateTransitionFrequencyCounts(TimelineAnalysisViewModel vm)
{
    //step 1: get list of files to process (GetFile returning an empty string ends the prompt loop)
    List<string> filesToProcess = new List<string>();
    string fileName;
    Console.WriteLine("Enter files to process (-1 to stop)");
    while ((fileName = GetFile()).Length > 0)
    {
        filesToProcess.Add(fileName);
    }

    //nothing was entered: stop here instead of failing on filesToProcess[0] below
    if (filesToProcess.Count == 0)
    {
        Console.WriteLine("No files to process.");
        return;
    }

    //load all data into VM: the first file starts the timeline, the rest are appended to it
    vm.LoadTimeline(filesToProcess[0]);
    for (int i = 1; i < filesToProcess.Count; i++)
    {
        vm.AppendTimeline(filesToProcess[i]);
    }

    //step 2: get sequence boundaries. Again, hard coded for now
    const int startingSequenceLength = 2;
    const int endingSequenceLength = 25;

    //sequences seen fewer times than this are dropped before matching
    const int minimumOccurrences = 5;

    //NOTE(review): this directory is created, but the only file this method writes
    //("popular_sequences.csv") goes to the current working directory. Confirm which
    //location is intended before changing either.
    string outputDirectory = "AggregateTransitionFrequencyCounts";
    Directory.CreateDirectory(outputDirectory); //no-op when the directory already exists

    //sequence length -> (transition key such as "1_4_2" -> number of occurrences)
    Dictionary<int, Dictionary<string, int>> allTransitions = new Dictionary<int, Dictionary<string, int>>();

    //begin file processing
    for (int sequenceLength = startingSequenceLength; sequenceLength <= endingSequenceLength; sequenceLength++)
    {
        //these two calls are repeated for every length exactly as before;
        //TODO confirm whether they can be hoisted out of the loop
        vm.AttachGrades();
        vm.BuildDefaultMarkovStates();

        //figure out sequence distribution for the entire data set
        Dictionary<string, int> transitions = vm.GetAllTransitionCombinations(sequenceLength);

        //filter out rare sequences (keys are collected first so the dictionary
        //is not modified while it is being enumerated)
        List<string> rareKeys = transitions
            .Where(t => t.Value < minimumOccurrences)
            .Select(t => t.Key)
            .ToList();
        foreach (string key in rareKeys)
        {
            transitions.Remove(key);
        }

        //save for future use
        allTransitions.Add(sequenceLength, transitions);
        Console.WriteLine("Loaded transitions of length {0}.", sequenceLength);
    }

    //use Needleman-Wunsch algorithm and disjoint sets to combine similar sequences
    DisjointSet<string> matches = new DisjointSet<string>();
    int matchCount = 0;

    //state numbers are shifted by this offset so that each state becomes a single character
    int startingNumber = (int)'a';

    //start with large sequences as it will make it more likely that these will be the "top" of the disjoint set
    for (int sequenceLength = endingSequenceLength; sequenceLength >= startingSequenceLength; sequenceLength--)
    {
        Console.WriteLine("Matching sequences of length {0}", sequenceLength);

        //Needleman-Wunsch works on single characters, so transform the Markov-like
        //numbers ("1_4_2") into letters ("b_e_c")
        List<string> sequences = new List<string>();
        foreach (KeyValuePair<string, int> kvp in allTransitions[sequenceLength])
        {
            //convert into numbers
            int[] pieces = kvp.Key.Split('_').Select(k => Convert.ToInt32(k) + startingNumber).ToArray();

            //then, convert to characters
            char[] sequence = pieces.Select(p => Convert.ToChar(p)).ToArray();

            //and finally into a string
            sequences.Add(string.Join("_", sequence));
        }

        //prime the disjoint set (Find is called only for its effect of registering the key)
        foreach (string key in sequences)
        {
            matches.Find(key);
        }

        //having converted to character state representations, compare every pair once
        for (int i = 0; i < sequences.Count; i++)
        {
            for (int j = i + 1; j < sequences.Count; j++)
            {
                string first = matches.Find(sequences[i]);
                string second = matches.Find(sequences[j]);

                //both already share a representative: a union would change nothing, so do
                //not spend an alignment on the pair or count it as a union
                if (first == second)
                {
                    continue;
                }

                string firstSequence = sequences[i];
                string secondSequence = sequences[j];
                bool similar;

                //Replace(...) leaves an empty string only when one sequence consists entirely
                //of repetitions of the other; such pairs are treated as the same sequence
                if (firstSequence.Replace(secondSequence, "").Length == 0
                    || secondSequence.Replace(firstSequence, "").Length == 0)
                {
                    similar = true;
                }
                else
                {
                    //align the two set representatives and treat a score below 3 as "similar"
                    var result = NeedlemanWunsch.Align(first, second);
                    similar = (double)NeedlemanWunsch.ScoreNpsmSequence(result.Item1, result.Item2) < 3;
                }

                if (similar)
                {
                    matches.UnionWith(first, second);
                    matchCount++;
                }
            }
        }
    }

    //now, get all sets and figure out popularity
    Console.WriteLine("{0} unions performed.", matchCount);
    List<List<string>> allSets = matches.AllSets();
    Dictionary<string, int> popularityDict = new Dictionary<string, int>();
    Console.WriteLine("Calculating popularity of {0} sets...", allSets.Count);

    //NOTE(review): counts are keyed by each individual sequence, so members of the same set
    //are not summed together. Confirm whether per-set totals were intended.
    foreach (List<string> set in allSets)
    {
        foreach (string item in set)
        {
            //convert back to Markov-style transitions
            int[] pieces = item.Split('_')
                .Select(c => Convert.ToChar(c))
                .Select(c => (int)c - startingNumber)
                .ToArray();
            string key = string.Join("_", pieces);

            //add in counts to the popularity dictionary (a missing key starts from 0)
            int runningTotal;
            popularityDict.TryGetValue(key, out runningTotal);
            popularityDict[key] = runningTotal + allTransitions[pieces.Length][key];
        }
    }

    //write this information to a file, most popular sequence first
    CsvWriter writer = new CsvWriter();
    Console.WriteLine("Writing most popular sequences to file.");
    foreach (KeyValuePair<string, int> kvp in popularityDict.OrderByDescending(p => p.Value))
    {
        //translate every state number into its NPSM string for the first column
        int[] pieces = kvp.Key.Split('_').Select(c => Convert.ToInt32(c)).ToArray();
        string npsmKey = string.Join("_", pieces.Select(p => vm.StateNumberToNpsmString(p)).ToArray());
        writer.AddToCurrentLine(npsmKey);
        writer.AddToCurrentLine(kvp.Value.ToString());
        writer.CreateNewRow();
    }
    using (TextWriter tw = File.CreateText("popular_sequences.csv"))
    {
        tw.Write(writer.ToString());
    }
}
/*
 * What I want to do:
 * For each assignment:
 *      figure out common sequences of length m to n
 *      For each student, for each grade band (A-F), again determine frequencies of length m to n
 *      Build a frequency distribution for each grade band by sequence
 * */
/// <summary>
/// For every entered file, every sequence length from 2 to 25 and every grade band (A-F),
/// writes two CSV files under "TransitionFrequencyCounts": an aggregate table of the
/// frequent transition sequences and a per-student table (raw and normalized rows).
/// </summary>
/// <param name="vm">View model used to attach grades, filter by grade band and count
/// transitions.</param>
private void BuildTransitionFrequencyCounts(TimelineAnalysisViewModel vm)
{
    //step 1: get list of files to process (GetFile returning an empty string ends the prompt loop)
    List<string> filesToProcess = new List<string>();
    string fileName;
    Console.WriteLine("Enter files to process (-1 to stop)");
    while ((fileName = GetFile()).Length > 0)
    {
        filesToProcess.Add(fileName);
    }

    //step 2: setup grade-bands (e.g. A, B, C, etc.) Hard coded for now as this is just for a
    //single class. gradeRanges[i] is the lower bound of the band named gradeMap[i].
    const double topScore = 200;
    double[] gradeRanges = { 90, 78, 69, 60, 0 };
    string[] gradeMap = { "A", "B", "C", "D", "F" };

    //step 3: get sequence boundaries. Again, hard coded for now
    const int startingSequenceLength = 2;
    const int endingSequenceLength = 25;

    //interesting transitions are those in which we have at least this many occurrences
    const int minimumOccurrences = 5;

    //step 4: get assignments. The n-th entered file is paired with the n-th assignment.
    string[] assignments = { "Assignment #1", "Assignment #2", "Assignment #3", "Assignment #4", "Assignment #5", "Assignment #6", "Assignment #7" };
    int assignmentCounter = 0;

    //this produces a lot of files, so create a separate directory for the output
    string outputDirectory = "TransitionFrequencyCounts";
    Directory.CreateDirectory(outputDirectory); //no-op when the directory already exists

    //finally, begin processing
    foreach (string fileToProcess in filesToProcess)
    {
        //there is no assignment name left to pair with this file: stop with a message
        //instead of indexing past the end of the assignments array
        if (assignmentCounter >= assignments.Length)
        {
            Console.WriteLine("Only {0} assignments are defined; skipping the remaining files.", assignments.Length);
            break;
        }

        //one output folder per input file, named after the file with '#' removed
        string folderName = fileToProcess.Replace("#", "");
        string outputPath = Path.Combine(outputDirectory, folderName);
        Directory.CreateDirectory(outputPath);

        for (int sequenceLength = startingSequenceLength; sequenceLength <= endingSequenceLength; sequenceLength++)
        {
            //reset max score for A students
            double maxScore = topScore;

            //based on currently existing code, it is easier to reopen the file for
            //each grade range
            for (int i = 0; i < gradeRanges.Length; i++)
            {
                double bound = gradeRanges[i];

                //reload the file (presumably into the same view model - TODO confirm)
                LoadFile(fileToProcess);

                //get grade data
                vm.AttachGrades();

                //filter based on grade data
                vm.FilterByGrade(assignments[assignmentCounter], bound, maxScore);

                //the next (lower) band ends just below this band's lower bound
                maxScore = bound - 0.01;

                //build markov transitions
                vm.BuildDefaultMarkovStates();

                //figure out sequence distribution for entire data set and for individual students
                Dictionary<string, int> transitions = vm.GetAllTransitionCombinations(sequenceLength);

                //keep transitions with at least minimumOccurrences occurrences, least frequent first
                List<KeyValuePair<string, int>> interestingTransitions = transitions
                    .Where(t => t.Value >= minimumOccurrences)
                    .OrderBy(t => t.Value)
                    .ToList();

                //aggregate class results: one "sequence,count" row per interesting transition
                CsvWriter writer = new CsvWriter();
                Console.WriteLine("Processing transition sequences of length {0}...", sequenceLength);
                foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                {
                    writer.AddToCurrentLine(kvp.Key);
                    writer.AddToCurrentLine(kvp.Value.ToString());
                    writer.CreateNewRow();
                }
                string aggregateFile = Path.Combine(
                    outputPath,
                    string.Format("aggregate_{0}_{1}.csv", sequenceLength, gradeMap[i]));
                using (TextWriter tw = File.CreateText(aggregateFile))
                {
                    tw.Write(writer.ToString());
                }

                //individual students
                //add header data: UserId, Grade, then one column per interesting transition
                writer = new CsvWriter();
                writer.AddToCurrentLine("UserId");
                writer.AddToCurrentLine("Grade");
                foreach (var kvp in interestingTransitions)
                {
                    writer.AddToCurrentLine(kvp.Key);
                }
                writer.CreateNewRow();

                foreach (var user in vm.Timeline.Values)
                {
                    //first row for users is raw values
                    writer.AddToCurrentLine(user.OsbideId);
                    writer.AddToCurrentLine(gradeMap[i]);

                    //only use the interesting states as columns as identified in the aggregate analysis
                    foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                    {
                        if (user.TransitionCounts.ContainsKey(kvp.Key))
                        {
                            writer.AddToCurrentLine(user.TransitionCounts[kvp.Key]);
                        }
                        else
                        {
                            writer.AddToCurrentLine("0");
                        }
                    }
                    writer.CreateNewRow();

                    //2nd row contains normalized values: each count divided by the total of
                    //all of this user's transition counts
                    writer.AddToCurrentLine(user.OsbideId);
                    writer.AddToCurrentLine(gradeMap[i]);
                    int totalTransitions = user.TransitionCounts.Values.Sum();

                    //only use the interesting states as columns as identified in the aggregate analysis
                    foreach (KeyValuePair<string, int> kvp in interestingTransitions)
                    {
                        if (user.TransitionCounts.ContainsKey(kvp.Key))
                        {
                            writer.AddToCurrentLine(user.TransitionCounts[kvp.Key] / (double)totalTransitions);
                        }
                        else
                        {
                            writer.AddToCurrentLine("0");
                        }
                    }
                    writer.CreateNewRow();
                }
                string individualFile = Path.Combine(
                    outputPath,
                    string.Format("individual_{0}_{1}.csv", sequenceLength, gradeMap[i]));
                using (TextWriter tw = File.CreateText(individualFile))
                {
                    tw.Write(writer.ToString());
                }
            }
        }

        //move to the next assignment
        assignmentCounter++;
    }
}