/// <summary>
/// Builds the list of correction candidates for <paramref name="word"/>.
/// </summary>
/// <remarks>
/// For a word that <c>IsInt()</c> reports as an integer, the result is the single
/// original token with a score of 1.0. Otherwise the word is expanded through
/// <c>Edits1</c> and <c>EditAgain</c>, the expansions whose term is a key of
/// <c>dictionary</c> are grouped by term, and <c>ranker.RankWordTokens</c> picks one
/// token per group.
/// </remarks>
/// <param name="word">The word to correct.</param>
/// <returns>A new list holding one token per distinct dictionary term reached.</returns>
public List<Token> correct(string word)
{
    var token = new Token(word);
    token.origTerm = word;
    var result = new List<Token>();

    Console.WriteLine("correct : " + word);

    // Numbers are left alone: return the original token unchanged with a fixed score.
    if (word.IsInt())
    {
        token.score = 1.0;
        result.Add(token);
        return result;
    }

    var candidates = Edits1(new List<Token>() { token });

    // Make sure the original word is itself a candidate.
    candidates.Add(token);

    // Merge the second round of edits in place. UnionWith is the in-place
    // counterpart of LINQ's Union, which only returns a lazy IEnumerable.
    var secondRound = EditAgain(candidates, 2);
    candidates.UnionWith(secondRound);

    // Group every candidate that is a known word by its term.
    var wordDict = new Dictionary<string, List<Token>>();
    foreach (Token s in candidates)
    {
        // NOTE(review): ContainsKey followed by getOrElse looks like a double lookup,
        // but the type of `dictionary` is not visible here, so the calls are kept as-is.
        if (!dictionary.ContainsKey(s.term))
        {
            continue;
        }

        // Initial score comes from the dictionary entry for the term.
        s.score = dictionary.getOrElse(s.term, 1.0);

        // Allocate a group list only the first time a term is seen, instead of
        // building a throwaway default list for every candidate.
        List<Token> sameWord;
        if (!wordDict.TryGetValue(s.term, out sameWord))
        {
            sameWord = new List<Token>();
            wordDict.Add(s.term, sameWord);
        }

        sameWord.Add(s);
    }

    // Each group holds tokens that all spell the same word; keep only the token
    // the ranker selects for that word.
    foreach (var sameWord in wordDict.Values)
    {
        result.Add(ranker.RankWordTokens(sameWord));
    }

    return result;
}
/// <summary>
/// Counts how many entries in the token's modification list have the given type.
/// </summary>
/// <param name="t">Token whose <c>modifications</c> are inspected.</param>
/// <param name="modType">Modification type to count.</param>
/// <returns>The number of modifications on <paramref name="t"/> whose type equals <paramref name="modType"/>.</returns>
private int GetModificationTypeCount(Token t, ModificationType modType)
{
    int matches = 0;

    foreach (var modification in t.modifications)
    {
        // Add one for a match, zero otherwise.
        matches += (modification.modType == modType) ? 1 : 0;
    }

    return matches;
}
/// <summary>
/// Expands each token in <paramref name="tokenList"/> by one edit (delete, transpose,
/// replace, insert) and returns the inputs together with every token produced.
/// </summary>
/// <remarks>
/// Every generated token keeps the source token's <c>origTerm</c> and carries a copy of
/// the source's modification list with the new edit appended. Tokens are added to the set
/// in a fixed order (inputs, deletions, transposes, replacements, inserts); when the set
/// treats two tokens as equal, the one added first is kept.
/// NOTE(review): whether two tokens count as duplicates depends on Token's equality,
/// which is not visible here.
/// NOTE(review): <c>split</c> presumably yields (prefix, suffix) pairs for each cut
/// position of the term — confirm against its definition.
/// </remarks>
/// <param name="tokenList">Tokens to expand.</param>
/// <returns>A set containing the input tokens and all generated tokens.</returns>
HashSet<Token> Edits1(List<Token> tokenList)
{
    // Seed with the inputs so the unedited tokens are always part of the result.
    var hs = new HashSet<Token>(tokenList);

    // Deletions: drop the first character of the right-hand part.
    foreach (var w in tokenList)
    {
        foreach (Tuple<string, string> t in split(w.term))
        {
            if (string.IsNullOrEmpty(t.Item2))
            {
                continue;
            }

            var mod = new Modification { modType = ModificationType.Delete, origChar = t.Item2[0] };
            hs.Add(DeriveToken(w, t.Item1 + t.Item2.Substring(1), mod));
        }
    }

    // Transposes: swap the first two characters of the right-hand part.
    foreach (var w in tokenList)
    {
        foreach (Tuple<string, string> t in split(w.term))
        {
            // Null-guarded like the deletion pass. Swapping two identical letters
            // would reproduce the same word, so that case is skipped.
            if (t.Item2 == null || t.Item2.Length <= 1 || t.Item2[0] == t.Item2[1])
            {
                continue;
            }

            var mod = new Modification
            {
                modType = ModificationType.Transpose,
                origChar = t.Item2[0],
                newChar = t.Item2[1]
            };
            hs.Add(DeriveToken(w, t.Item1 + t.Item2[1] + t.Item2[0] + t.Item2.Substring(2), mod));
        }
    }

    // Replacements: substitute the first character of the right-hand part.
    foreach (var w in tokenList)
    {
        foreach (Tuple<string, string> t in split(w.term))
        {
            if (string.IsNullOrEmpty(t.Item2))
            {
                continue;
            }

            foreach (char c in alphabet)
            {
                // Replacing a character with itself is not an edit.
                if (c == t.Item2[0])
                {
                    continue;
                }

                var mod = new Modification
                {
                    modType = ModificationType.Replace,
                    origChar = t.Item2[0],
                    newChar = c
                };
                hs.Add(DeriveToken(w, t.Item1 + c + t.Item2.Substring(1), mod));
            }
        }
    }

    // Inserts: place a character between the two parts. An empty (or null) right-hand
    // part concatenates as nothing, so appending at the end needs no separate branch.
    foreach (var w in tokenList)
    {
        foreach (Tuple<string, string> t in split(w.term))
        {
            foreach (char c in alphabet)
            {
                // The inserted character is recorded in origChar (newChar is left unset).
                var mod = new Modification { modType = ModificationType.Insert, origChar = c };
                hs.Add(DeriveToken(w, t.Item1 + c + t.Item2, mod));
            }
        }
    }

    return hs;
}

/// <summary>
/// Creates a token for <paramref name="newTerm"/> that inherits the source token's
/// original term and modification history, then records <paramref name="mod"/> as the
/// latest modification.
/// </summary>
private static Token DeriveToken(Token source, string newTerm, Modification mod)
{
    var token = new Token(newTerm);
    token.origTerm = source.origTerm;

    // Copy the history so the new token does not share the source's list.
    token.modifications = source.modifications.GetRange(0, source.modifications.Count);
    token.modifications.Add(mod);
    return token;
}