Пример #1
0
        /// <summary>
        /// Registers three words that each differ from the query "steam" by one
        /// appended letter and checks that a <c>Verbosity.Top</c> lookup returns
        /// exactly one suggestion: the entry registered with the highest count.
        /// </summary>
        public void LookupShouldReturnMostFrequent()
        {
            const string query = "steam";
            var checker = new SymSpell();

            checker.CreateDictionaryEntry("steama", 4);
            checker.CreateDictionaryEntry("steamb", 6);
            checker.CreateDictionaryEntry("steamc", 2);

            var suggestions = checker.Lookup(query, SymSpell.Verbosity.Top, 2);

            // Only a single winner is expected at Top verbosity.
            Assert.AreEqual(1, suggestions.Count);

            var best = suggestions[0];
            Assert.AreEqual("steamb", best.term);
            Assert.AreEqual(6, best.count);
        }
Пример #2
0
        /// <summary>
        /// Runs the same query against the same three-word dictionary at each
        /// verbosity level and checks how many suggestions come back:
        /// 1 for <c>Top</c>, 2 for <c>Closest</c> and 3 for <c>All</c>.
        /// </summary>
        public void VerbosityShouldControlLookupResults()
        {
            const string query       = "steems";
            const int    maxDistance = 2;
            var          checker     = new SymSpell();

            checker.CreateDictionaryEntry("steam", 1);
            checker.CreateDictionaryEntry("steams", 2);
            checker.CreateDictionaryEntry("steem", 3);

            var topOnly = checker.Lookup(query, SymSpell.Verbosity.Top, maxDistance);
            Assert.AreEqual(1, topOnly.Count);

            var closest = checker.Lookup(query, SymSpell.Verbosity.Closest, maxDistance);
            Assert.AreEqual(2, closest.Count);

            var everything = checker.Lookup(query, SymSpell.Verbosity.All, maxDistance);
            Assert.AreEqual(3, everything.Count);
        }
Пример #3
0
        /// <summary>
        /// Checks that a word registered with a count of 1 is not returned by an
        /// exact (distance 0) lookup when the checker is built with the
        /// arguments (16, 2, 7, 10).
        /// </summary>
        public void LookupShouldNotReturnLowCountWord()
        {
            const string word = "pawn";

            // NOTE(review): presumably the last constructor argument (10) is the
            // minimum count a word needs to be suggested - confirm against SymSpell.
            var checker = new SymSpell(16, 2, 7, 10);
            checker.CreateDictionaryEntry(word, 1);

            var suggestions = checker.Lookup(word, SymSpell.Verbosity.Top, 0);

            Assert.AreEqual(0, suggestions.Count);
        }
Пример #4
0
        /// <summary>
        /// Registers a word with a count of <c>long.MaxValue - 10</c>, then adds
        /// 11 more, and checks that the reported count ends up at
        /// <c>long.MaxValue</c>.
        /// </summary>
        public void AddAdditionalCountsShouldNotOverflow()
        {
            const string term    = "hello";
            const long   nearMax = long.MaxValue - 10;
            var          checker = new SymSpell();

            // First registration: the count must be stored unchanged.
            checker.CreateDictionaryEntry(term, nearMax);
            var  found    = checker.Lookup(term, SymSpell.Verbosity.Top);
            long observed = found.Count == 1 ? found[0].count : 0;
            Assert.AreEqual(nearMax, observed);

            // Second registration would exceed long.MaxValue if added naively.
            checker.CreateDictionaryEntry(term, 11);
            found    = checker.Lookup(term, SymSpell.Verbosity.Top);
            observed = found.Count == 1 ? found[0].count : 0;
            Assert.AreEqual(long.MaxValue, observed);
        }
Пример #5
0
        /// <summary>
        /// Registers the same word twice (counts 11 and 3) and checks that the
        /// count reported by a lookup is 11 after the first call and 14 after
        /// the second.
        /// </summary>
        public void AddAdditionalCountsShouldIncreaseCount()
        {
            const string term    = "hello";
            const int    initial = 11;
            const int    extra   = 3;
            var          checker = new SymSpell();

            checker.CreateDictionaryEntry(term, initial);
            var  found    = checker.Lookup(term, SymSpell.Verbosity.Top);
            long observed = found.Count == 1 ? found[0].count : 0;
            Assert.AreEqual(initial, observed);

            checker.CreateDictionaryEntry(term, extra);
            found    = checker.Lookup(term, SymSpell.Verbosity.Top);
            observed = found.Count == 1 ? found[0].count : 0;
            Assert.AreEqual(initial + extra, observed);
        }
Пример #6
0
        /// <summary>
        /// Builds a word-level inverted index over the lines of
        /// <c>real-suggests.txt</c> (located next to the executable), loads the
        /// distinct words into a suffix trie and a SymSpell spell checker, saves
        /// everything through <c>SaveIndex</c>, and finally runs one sample
        /// search, printing memory and timing figures along the way.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var path        = AppDomain.CurrentDomain.BaseDirectory + "real-suggests.txt";
            var indexerPath = AppDomain.CurrentDomain.BaseDirectory + "real-suggests.index";

            // Alternative data sets:
            // var path = AppDomain.CurrentDomain.BaseDirectory + "all-suggests-cleaned.txt";
            // var indexerPath = AppDomain.CurrentDomain.BaseDirectory + "all-suggests-cleaned.index";
            // var path = AppDomain.CurrentDomain.BaseDirectory + "small-suggests.txt";
            // var indexerPath = AppDomain.CurrentDomain.BaseDirectory + "small-suggests.index";

            var dict      = new Dictionary<string, int>();          // word -> word id (ids start at 1)
            var frequency = new Dictionary<string, int>();          // word -> number of occurrences
            var inverter  = new Dictionary<int, HashSet<int>>();    // word id -> indexes of lines containing it
            var documents = new List<string>();                     // every input line, in file order

            var count       = 0;   // last word id handed out
            var stringIndex = 0;   // index of the line currently being read

            Console.WriteLine("Reading and indexing ...");

            var trie = new PatriciaSuffixTrie<string>(1);

            using (var sr = new StreamReader(path))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    documents.Add(line);
                    foreach (var word in line.Split(' '))
                    {
                        int wordId;
                        if (!dict.TryGetValue(word, out wordId))
                        {
                            wordId          = ++count;
                            dict[word]      = wordId;
                            frequency[word] = 1;
                        }
                        else
                        {
                            frequency[word]++;
                        }

                        // Key the posting list by the word's own id rather than by the
                        // running counter, so the mapping does not rely on the two
                        // happening to be equal for a newly seen word.
                        HashSet<int> postings;
                        if (!inverter.TryGetValue(wordId, out postings))
                        {
                            postings         = new HashSet<int>();
                            inverter[wordId] = postings;
                        }
                        postings.Add(stringIndex);
                    }
                    stringIndex++;
                }
            }

            Console.WriteLine("Adding to completion dict...");
            long memSize = GC.GetTotalMemory(true);
            foreach (var entry in dict)
            {
                trie.Add(entry.Key, entry.Key);
            }
            Console.WriteLine("Add to completion dict: " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + " MB. Token count: " + dict.Count);

            Stopwatch stopWatch = Stopwatch.StartNew();
            Console.WriteLine("Adding to symSpell for fast spellCheck");
            memSize = GC.GetTotalMemory(true);
            // dictionary for symspell
            var spellChecker = new SymSpell(dict.Count, 2);

            foreach (var entry in frequency)
            {
                spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
            }

            Console.WriteLine("Spell dictionary constructed. " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + "MB, " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms. Tokens:" + frequency.Count);

            // Time the save itself: the stopwatch must be running while SaveIndex
            // executes and must be read before it is replaced.
            Console.WriteLine("Saving index ...");
            stopWatch = Stopwatch.StartNew();
            long byteCount = SaveIndex(indexerPath, trie, spellChecker, inverter, dict, documents);
            stopWatch.Stop();
            Console.WriteLine("File saved: " + byteCount + " bytes. Time ellapsed: " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms");

            Console.WriteLine("Searching ...");
            stopWatch = Stopwatch.StartNew();
            var hits = Search("342 cw", trie, spellChecker, inverter, dict, documents, 10);
            stopWatch.Stop();
            var timeEllapsed = stopWatch.Elapsed.TotalMilliseconds.ToString("0.0");

            // foreach(var hit in hits) {
            //     Console.WriteLine("--> " + hit.value);
            // }

            Console.WriteLine("Searching done." + timeEllapsed + "ms. Hits:" + hits.Length);
        }
Пример #7
0
        /// <summary>
        /// Tokenizes one line of a pattern definition, character by character,
        /// and appends an <c>Action</c> for each token to the target list:
        /// <c>p.input</c> when <paramref name="input"/> is true, otherwise a new
        /// list that is first appended to <c>p.responses</c>.
        /// Tokens are separated by spaces; a '#' ends processing of the line.
        /// </summary>
        /// <remarks>
        /// The scanner is a small state machine driven by <c>status</c>:
        /// 's' - between tokens (spaces are skipped);
        /// 'w' - inside a plain word (no operator), characters go to <c>holder</c>;
        /// 'o' - inside a symbolic operator, characters go to <c>_operator</c>;
        /// 'n' - inside an operator name opened by '_', characters go to
        ///       <c>_operator</c> until the closing '_';
        /// 'a' - inside the argument that follows an operator, characters go to
        ///       <c>holder</c>.
        /// </remarks>
        /// <param name="line">Text to tokenize.</param>
        /// <param name="input">True to fill <c>p.input</c>; false to fill a new response list.</param>
        /// <param name="p">Pattern that receives the actions and context settings.</param>
        /// <param name="nline">Passed unchanged to each <c>Action</c> (presumably the source line number - TODO confirm).</param>
        public void parse(string line, bool input, Pattern p, int nline)
        {
            char status = 's';
            //int ind =0;
            string holder    = "";
            string _operator = "";

            // Tokens are only flushed when a space is seen, so a trailing space
            // guarantees the last token on the line is flushed too.
            line += ' ';

            if (!input)
            {
                p.responses.Add(new List <Action>());
            }

            // For responses this is the list that was just appended above.
            var vTarget = input ? p.input : p.responses[p.responses.Count - 1];

            for (int ind = 0; ind < line.Length; ind++)
            {
                char c = line[ind];

                // Skip separators between tokens.
                if (c == ' ' && status == 's')
                {
                    continue;
                }

                // End of a token that carried an operator (with or without argument).
                if (c == ' ' && (status == 'a' || status == 'o'))
                {
                    status = 's';

                    // Normalise "no operator" to the same " " marker that plain words use.
                    if (_operator.Equals(""))
                    {
                        _operator = " ";
                    }

                    // "$" and "$$" set context fields on the pattern instead of producing an Action.
                    if (!_operator.Equals("$") && !_operator.Equals("$$"))
                    {
                        // NOTE(review): ind + 1 is presumably a column position - confirm against Action.
                        vTarget.Add(new Action(_operator, holder, nline, ind + 1));
                    }
                    else if (_operator.Equals("$$"))
                    {
                        p.contextRouter = holder.Trim();
                        Console.WriteLine("SETTING route as {0}", holder);
                    }
                    else
                    {
                        Console.WriteLine("Route as {0}", holder);
                        p.context = holder.Trim();
                    }

                    // Every input token except the "*" operator counts towards tranferTerms.
                    if (!_operator.Equals("*") && input)
                    {
                        p.tranferTerms++;
                    }

                    // Input tokens are also registered with `corrector` (declared outside
                    // this method; presumably a spell corrector - TODO confirm).
                    if (input)
                    {
                        corrector.CreateDictionaryEntry(holder, 3);
                    }

                    if (_operator == "->" || _operator == "@")
                    {
                        p.hasSubject = true;
                    }

                    holder    = "";
                    _operator = "";
                    continue;
                }

                // End of a plain word.
                // NOTE(review): 'w' is only entered from 's', where _operator is still
                // empty, so the "$" / "$$" branches below look unreachable - confirm.
                if (c == ' ' && status == 'w')
                {
                    status = 's';

                    if (!_operator.Equals("$") && !_operator.Equals("$$"))
                    {
                        vTarget.Add(new Action(" ", holder, nline, ind + 1));
                    }
                    else if (_operator.Equals("$$"))
                    {
                        p.contextRouter = holder.Trim();
                        Console.WriteLine("SETTING route as {0}", holder);
                    }
                    else
                    {
                        Console.WriteLine("Route as {0}", holder);
                        p.context = holder.Trim();
                    }

                    if (!_operator.Equals("*") && input)
                    {
                        p.tranferTerms++;
                    }

                    if (input)
                    {
                        corrector.CreateDictionaryEntry(holder, 3);
                    }
                    holder    = "";
                    _operator = "";
                    continue;
                }

                // '#' stops processing in every state; a token still being
                // accumulated at this point is never flushed.
                if (c == '#')
                {
                    break;
                }

                // ASCII letters (65-90, 97-122) and digits (48-57) always take this
                // branch; inside a word or argument any other character does too,
                // and inside an operator name everything except '_' does.
                if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) ||
                    (c >= 48 && c <= 57) ||
                    (status == 'n' && c != '_') || status == 'w' || status == 'a')
                {
                    // A letter/digit right after a separator starts a plain word.
                    if (status == 's')
                    {
                        status = 'w';
                    }

                    if (status == 'w')
                    {
                        holder += c;
                        continue;
                    }

                    // A letter/digit right after operator symbols starts the argument.
                    if (status == 'o')
                    {
                        status = 'a';
                    }

                    if (status == 'a')
                    {
                        holder += c;
                        continue;
                    }

                    // Inside "_name_": the character belongs to the operator name.
                    if (status == 'n')
                    {
                        _operator += c;
                    }
                }
                else
                {
                    if (c == '_')
                    {
                        // The first '_' opens an operator name, the second closes it and
                        // starts the argument; in state 'o' the '_' is simply dropped.
                        if (status == 's')
                        {
                            status = 'n';
                        }
                        else if (status == 'n')
                        {
                            status = 'a';
                        }
                        continue;
                    }
                    else
                    {
                        // Any other symbol starts or extends a symbolic operator.
                        if (status == 's')
                        {
                            status = 'o';
                        }
                        if (status == 'o')
                        {
                            _operator += c;
                        }
                    }
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Loads the phrases in <c>all-suggests-cleaned.txt</c> (located next to
        /// the executable), indexes them in a suffix trie, builds a SymSpell
        /// frequency dictionary from their space-separated tokens, and then
        /// answers search queries read from the console until the user types
        /// "exit" or the input stream ends.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            var path = AppDomain.CurrentDomain.BaseDirectory + @"all-suggests-cleaned.txt";

            Console.Write("Creating trie ...");
            long      memSize   = GC.GetTotalMemory(true);
            Stopwatch stopWatch = new Stopwatch();

            stopWatch.Start();
            var wordToIndex   = new Dictionary<string, int>();   // token -> id, in order of first appearance
            var wordFrequency = new Dictionary<string, int>();   // token -> number of occurrences
            var phraseList    = new List<string>();              // trimmed input lines, in file order
            int count         = 0;

            using (StreamReader sr = new StreamReader(path))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    var phrase = line.Trim();
                    phraseList.Add(phrase);

                    foreach (var token in phrase.Split(' '))
                    {
                        if (!wordToIndex.ContainsKey(token))
                        {
                            wordToIndex[token] = count++;
                        }

                        int freq;
                        wordFrequency[token] = wordFrequency.TryGetValue(token, out freq) ? freq + 1 : 1;
                    }
                }
            }

            long memDeltaForStoringValues = GC.GetTotalMemory(true) - memSize;

            Console.WriteLine("Memory for storing value: " + memDeltaForStoringValues + ". Going to add to trie");

            // Each phrase is stored under its position in phraseList so that trie
            // hits can be mapped back to the original text.
            var trie  = new UkkonenTrie<int>(1);
            int value = 0;

            foreach (var phrase in phraseList)
            {
                trie.Add(phrase, value++);
            }

            stopWatch.Stop();
            long memDelta = GC.GetTotalMemory(true) - memSize;

            Console.WriteLine("Done in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
                              + (memDelta / 1024 / 1024.0).ToString("N0") + " MB. Token count: " + wordToIndex.Count);

            // spell checker
            var spellChecker = new SymSpell(wordToIndex.Count, 2);

            foreach (var entry in wordFrequency)
            {
                spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
            }

            while (true)
            {
                Console.WriteLine("Input string to search:");
                var s = Console.ReadLine();

                // ReadLine returns null when the input stream is closed (Ctrl+Z/Ctrl+D
                // or redirected input running out); treat that like "exit" instead of
                // failing on s.ToLower().
                if (s == null || s == "exit")
                {
                    return;
                }

                var normalized = s.ToLower();
                var suggests   = spellChecker.LookupCompound(normalized, 2);

                // lookup in trie
                var results = trie.Retrieve(normalized);

                var resultCount = 0;
                foreach (var result in results)
                {
                    Console.WriteLine("--> " + phraseList[result]);
                    resultCount++;
                }

                // Remember the first suggestion while listing them all, so that an
                // empty suggestion list no longer throws on suggests[0].
                string suggest = null;
                foreach (var sug in suggests)
                {
                    if (suggest == null)
                    {
                        suggest = sug.term;
                    }
                    Console.WriteLine("Can search for: " + sug.term);
                }
                if (suggest != null && suggest != normalized)
                {
                    Console.WriteLine("Did you mean: " + suggest + "?");
                }

                Console.WriteLine(String.Format("Found {0} result", resultCount));
            }
        }