Beispiel #1
0
        /// <summary>
        /// Produces spelling-correction candidates for <paramref name="word"/>.
        /// </summary>
        /// <remarks>
        /// A word for which <c>IsInt()</c> is true is returned as-is with a score of 1.0.
        /// Otherwise candidate tokens are generated with <c>Edits1</c> and <c>EditAgain</c>,
        /// filtered to those whose term is a key of <c>dictionary</c>, grouped by term, and
        /// each group is reduced to a single token by <c>ranker.RankWordTokens</c>.
        /// </remarks>
        /// <param name="word">The word to correct.</param>
        /// <returns>
        /// A list with one token per distinct dictionary term found among the candidates,
        /// or a single token wrapping <paramref name="word"/> when it is an integer.
        /// </returns>
        public List<Token> correct(string word)
        {
            var token = new Token( word );
            token.origTerm = word;
            var result = new List<Token>();

            Console.WriteLine("correct : " + word );

            // Numbers are left alone.
            if (word.IsInt())
            {
                token.score = 1.0;
                result.Add(token);
                return result;
            }

            var candidates = Edits1(new List<Token>() { token });

            // Make sure the original word is among the candidates (no-op if already present).
            candidates.Add(token);

            // UnionWith merges in place, unlike LINQ's Union which returns a new sequence.
            candidates.UnionWith(EditAgain(candidates, 2));

            // Group every candidate that is a known dictionary word by its term.
            var tokensByTerm = new Dictionary<string, List<Token>>();
            foreach (Token candidate in candidates)
            {
                if (!dictionary.ContainsKey(candidate.term))
                {
                    continue;
                }

                // Initial score.
                candidate.score = dictionary.getOrElse(candidate.term, 1.0);

                // Only allocate a list the first time a term is seen.
                List<Token> sameTerm;
                if (!tokensByTerm.TryGetValue(candidate.term, out sameTerm))
                {
                    sameTerm = new List<Token>();
                    tokensByTerm.Add(candidate.term, sameTerm);
                }
                sameTerm.Add(candidate);
            }

            // Each group holds tokens for the same word; keep only the one the ranker picks.
            foreach (var group in tokensByTerm.Values)
            {
                result.Add(ranker.RankWordTokens(group));
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Counts how many of the token's modifications have the given type.
        /// </summary>
        /// <param name="t">Token whose <c>modifications</c> are inspected.</param>
        /// <param name="modType">Modification type to count.</param>
        /// <returns>The number of entries in <c>t.modifications</c> whose <c>modType</c> equals <paramref name="modType"/>.</returns>
        private int GetModificationTypeCount(Token t, ModificationType modType )
        {
            var matches = 0;

            foreach (var modification in t.modifications)
            {
                // Skip anything that is not the requested kind of edit.
                if (modification.modType != modType)
                {
                    continue;
                }

                matches += 1;
            }

            return matches;
        }
Beispiel #3
0
        /// <summary>
        /// Builds the set of tokens that are one edit away from the tokens in
        /// <paramref name="tokenList"/>, including the input tokens themselves.
        /// </summary>
        /// <remarks>
        /// For every (head, tail) pair returned by <c>split</c> for a token's term, this
        /// generates deletions, transpositions of the first two tail characters,
        /// replacements and insertions using the characters of <c>alphabet</c>.
        /// Edits are generated one kind at a time over the whole list (all deletions, then
        /// all transposes, then replacements, then inserts), so the order in which tokens
        /// are offered to the set is the same as it has always been.
        /// </remarks>
        /// <param name="tokenList">Tokens to derive edits from.</param>
        /// <returns>A set containing the input tokens and all derived tokens.</returns>
        HashSet<Token> Edits1(List<Token> tokenList)
        {
            var edits = new HashSet<Token>();

            // The unmodified tokens are always part of the result.
            foreach (var source in tokenList)
            {
                edits.Add(source);
            }

            // Deletions: drop the first character of the tail.
            foreach (var source in tokenList)
            {
                foreach (Tuple<string, string> parts in split(source.term))
                {
                    if (string.IsNullOrEmpty(parts.Item2))
                    {
                        continue;
                    }

                    var mod = new Modification();
                    mod.modType = ModificationType.Delete;
                    mod.origChar = parts.Item2[0];

                    edits.Add(DeriveEditedToken(source, parts.Item1 + parts.Item2.Substring(1), mod));
                }
            }

            // Transposes: swap the first two characters of the tail.
            foreach (var source in tokenList)
            {
                foreach (Tuple<string, string> parts in split(source.term))
                {
                    // Needs two characters, and swapping identical characters changes nothing.
                    if (parts.Item2.Length <= 1 || parts.Item2[0] == parts.Item2[1])
                    {
                        continue;
                    }

                    var mod = new Modification();
                    mod.modType = ModificationType.Transpose;
                    mod.origChar = parts.Item2[0];
                    mod.newChar = parts.Item2[1];

                    var swapped = parts.Item1 + parts.Item2[1] + parts.Item2[0] + parts.Item2.Substring(2);
                    edits.Add(DeriveEditedToken(source, swapped, mod));
                }
            }

            // Replacements: substitute the first character of the tail.
            foreach (var source in tokenList)
            {
                foreach (Tuple<string, string> parts in split(source.term))
                {
                    if (parts.Item2 == "")
                    {
                        continue;
                    }

                    foreach (char c in alphabet)
                    {
                        // Only replace with a different character.
                        if (c == parts.Item2[0])
                        {
                            continue;
                        }

                        var mod = new Modification();
                        mod.modType = ModificationType.Replace;
                        mod.origChar = parts.Item2[0];
                        mod.newChar = c;

                        edits.Add(DeriveEditedToken(source, parts.Item1 + c + parts.Item2.Substring(1), mod));
                    }
                }
            }

            // Inserts: put a character between head and tail. With an empty tail this
            // is simply an append, so no separate case is needed.
            foreach (var source in tokenList)
            {
                foreach (Tuple<string, string> parts in split(source.term))
                {
                    foreach (char c in alphabet)
                    {
                        var mod = new Modification();
                        mod.modType = ModificationType.Insert;
                        // The inserted character is recorded in origChar.
                        mod.origChar = c;

                        edits.Add(DeriveEditedToken(source, parts.Item1 + c + parts.Item2, mod));
                    }
                }
            }

            return edits;
        }

        /// <summary>
        /// Creates a token for <paramref name="newTerm"/> that inherits the original term of
        /// <paramref name="source"/> and a copy of its modification history with
        /// <paramref name="mod"/> appended.
        /// </summary>
        private static Token DeriveEditedToken(Token source, string newTerm, Modification mod)
        {
            var derived = new Token(newTerm);
            derived.origTerm = source.origTerm;

            // Copy the list so the source token's history is not shared or mutated.
            derived.modifications = source.modifications.GetRange(0, source.modifications.Count);
            derived.modifications.Add(mod);

            return derived;
        }