예제 #1
0
        static void TestTime()
        {
            int repeatCount       = 1000;
            int currentWordLength = 0;
            int finalWordLength   = 16;

            for (; currentWordLength < finalWordLength; currentWordLength++)
            {
                Console.WriteLine("Word length: " + currentWordLength);

                foreach (StringDistance.Measure method in Enum.GetValues(typeof(StringDistance.Measure)))
                {
                    string a = GetRandomWord(currentWordLength);
                    string b = GetRandomWord(currentWordLength);

                    Stopwatch watch = new Stopwatch();
                    watch.Start();
                    for (int i = 0; i < repeatCount; i++)
                    {
                        StringDistance measureMethod = StringDistance.StringDistanceBuilder.GetInstance(method, a, b);
                        int            result        = measureMethod.GetDistance();
                    }
                    watch.Stop();

                    long   ticks    = watch.ElapsedTicks;
                    double avgTicks = (double)ticks / repeatCount;
                    Console.WriteLine("Method: " + method.ToString() + "; Ticks average: " + avgTicks);
                }

                Console.WriteLine();
            }
        }
예제 #2
0
        static void Main(string[] args)
        {
            if (args.Length > 0 && args[0] == "-t")
            {
                TestTime();
                Console.WriteLine("TESTING COMPLETED");
                Console.ReadKey();
                return;
            }

            if (args.Length > 0 && args[0] == "-rt")
            {
                TestTree();
                Console.WriteLine("TESTING COMPLETED");
                Console.ReadKey();
                return;
            }

            string a, b;

            Console.Write("Первая строка: ");
            a = Console.ReadLine();
            Console.Write("Вторая строка: ");
            b = Console.ReadLine();

            Console.WriteLine("\nРезультат: \n");

            foreach (StringDistance.Measure measure in Enum.GetValues(typeof(StringDistance.Measure)))
            {
                StringDistance distance = StringDistance.StringDistanceBuilder.GetInstance(measure, a, b);

                if (distance == null)
                {
                    continue;
                }

                System.Diagnostics.Stopwatch stopwatch = new System.Diagnostics.Stopwatch();


                GC.Collect();

                stopwatch.Start();

                int result = distance.GetDistance();

                stopwatch.Stop();

                LetterMatrix matrix = distance.GetLetterMatrix();
                Console.WriteLine("Метод: " + distance.MethodName);
                Console.WriteLine("Значение: " + result);
                Console.WriteLine("Матрица: ");
                Console.Write(matrix.ToString());

                Console.WriteLine("Прошло времени (тиков): " + stopwatch.ElapsedTicks);
                Console.WriteLine("Прошло времени (секунд): " + stopwatch.ElapsedMilliseconds / 1000f);
                Console.WriteLine("\n");
            }

            Console.ReadKey();
        }
        public float GetDistance(string s1, string s2)
        {
            if (s1.Length < s2.Length || _parentDistance.GetDistance(s1.Substring(0, s2.Length), s2) != 1)
            {
                return(0);
            }

            if (s1.Length == s2.Length)
            {
                return(1);
            }

            return(1 / (float)(s2.Length - s1.Length));
        }
예제 #4
0
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="numSug">int the number of suggest words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
        /// (only if restricted mode = (indexReader!=null and field!=null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of the suggest words with this 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
        {    // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();

            try
            {
                float min        = this.minScore;
                int   lengthWord = word.Length;

                int freq     = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (!morePopular && freq > 0)
                {
                    return(new String[] { word });
                }

                var      query = new BooleanQuery();
                String[] grams;
                String   key;

                var alreadySeen = new HashSet <string>();
                for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {
                    key = "gram" + ng;           // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0)
                    {                                               // should we boost prefixes?
                        Add(query, "start" + ng, grams[0], bStart); // matches start of word
                    }
                    if (bEnd > 0)
                    {                                                          // should we boost suffixes
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int         stop    = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {
                    sugWord.termString = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if (sugWord.termString.Equals(word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.termString);
                    if (sugWord.score < min)
                    {
                        continue;
                    }

                    if (ir != null && field != null)
                    {                                                                   // use the user index
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }

                    if (alreadySeen.Add(sugWord.termString) == false) // we already seen this word, no point returning it twice
                    {
                        continue;
                    }

                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        min = ((SuggestWord)sugQueue.Top()).score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                String[] list = new String[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = ((SuggestWord)sugQueue.Pop()).termString;
                }

                return(list);
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }
예제 #5
0
        public void TestGetDistance()
        {
            float d = sd.GetDistance("al", "al");

            Assert.AreEqual(d, 1.0f, 0.001);
            d = sd.GetDistance("martha", "marhta");
            Assert.AreEqual(d, 0.6666, 0.001);
            d = sd.GetDistance("jones", "johnson");
            Assert.AreEqual(d, 0.4285, 0.001);
            d = sd.GetDistance("abcvwxyz", "cabvwxyz");
            Assert.AreEqual(d, 0.75, 0.001);
            d = sd.GetDistance("dwayne", "duane");
            Assert.AreEqual(d, 0.666, 0.001);
            d = sd.GetDistance("dixon", "dicksonx");
            Assert.AreEqual(d, 0.5, 0.001);
            d = sd.GetDistance("six", "ten");
            Assert.AreEqual(d, 0, 0.001);
            float d1 = sd.GetDistance("zac ephron", "zac efron");
            float d2 = sd.GetDistance("zac ephron", "kai ephron");

            Assert.AreEqual(d1, d2, 0.001);
            d1 = sd.GetDistance("brittney spears", "britney spears");
            d2 = sd.GetDistance("brittney spears", "brittney startzman");
            Assert.True(d1 > d2);
        }
예제 #6
0
        /// <summary>
        /// Suggest similar words (optionally restricted to a field of an index).
        ///
        /// <para>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
        /// is not the same as the edit distance strategy used to calculate the best
        /// matching spell-checked word from the hits that Lucene found, one usually has
        /// to retrieve a couple of numSug's in order to get the true best match.
        ///
        /// </para>
        /// <para>I.e. if numSug == 1, don't count on that suggestion being the best one.
        /// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
        ///
        /// </para>
        /// </summary>
        /// <param name="word"> the word you want a spell check done on </param>
        /// <param name="numSug"> the number of suggested words </param>
        /// <param name="ir"> the indexReader of the user index (can be null see field param) </param>
        /// <param name="field"> the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field. </param>
        /// <param name="suggestMode">
        /// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
        /// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
        /// <exception cref="IOException"> if the underlying index throws an <seealso cref="IOException"/> </exception>
        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
        /// <returns> String[] the sorted list of the suggest words with these 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        ///  </returns>
        public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
        {
            // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();

            try
            {
                if (ir == null || field == null)
                {
                    suggestMode = SuggestMode.SUGGEST_ALWAYS;
                }
                if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
                {
                    ir    = null;
                    field = null;
                }

                int lengthWord = word.Length;

                int freq     = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
                {
                    return(new string[] { word });
                }

                BooleanQuery query = new BooleanQuery();
                string[]     grams;
                string       key;

                for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {
                    key = "gram" + ng;           // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0)                                            // should we boost prefixes?
                    {
                        Add(query, "start" + ng, grams[0], bStart);            // matches start of word
                    }
                    if (bEnd > 0)                                              // should we boost suffixes
                    {
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int         stop    = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {
                    sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if ([email protected](word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.@string);
                    if (sugWord.score < accuracy)
                    {
                        continue;
                    }

                    if (ir != null && field != null)                                 // use the user index
                    {
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }
                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        accuracy = sugQueue.Top().score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                string[] list = new string[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = sugQueue.Pop().@string;
                }

                return(list);
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }
예제 #7
0
        public void TestGetDistance()
        {
            float d = sd.GetDistance("al", "al");

            Assert.IsTrue(d == 1.0f);
            d = sd.GetDistance("martha", "marhta");
            Assert.IsTrue(d > 0.961 && d < 0.962);
            d = sd.GetDistance("jones", "johnson");
            Assert.IsTrue(d > 0.832 && d < 0.833);
            d = sd.GetDistance("abcvwxyz", "cabvwxyz");
            Assert.IsTrue(d > 0.958 && d < 0.959);
            d = sd.GetDistance("dwayne", "duane");
            Assert.IsTrue(d > 0.84 && d < 0.841);
            d = sd.GetDistance("dixon", "dicksonx");
            Assert.IsTrue(d > 0.813 && d < 0.814);
            d = sd.GetDistance("fvie", "ten");
            Assert.IsTrue(d == 0f);
            float d1 = sd.GetDistance("zac ephron", "zac efron");
            float d2 = sd.GetDistance("zac ephron", "kai ephron");

            Assert.IsTrue(d1 > d2);
            d1 = sd.GetDistance("brittney spears", "britney spears");
            d2 = sd.GetDistance("brittney spears", "brittney startzman");
            Assert.IsTrue(d1 > d2);
        }