Stem() public method

Find the stem(s) of the provided word
public Stem ( char word, int length ) : IList
word char Word to find the stems for
length int
return IList
Exemplo n.º 1
0
 // ********************** STEMMER, not used in favour of SqlServer
 // Uses the Hunspell stemmer. It returns either the base word from the .DIC file or the word itself.
 // For Czech, it does not work for some verbs ("slovesa").
 /// <summary>
 /// Runs the Hunspell stemmer over every word of every dictionary returned by <c>files()</c>
 /// and reports the dictionaries for which building the dictionary or stemming fails.
 /// Prints "DONE" and waits for a key press when finished.
 /// </summary>
 /// <remarks>
 /// For each tuple, <c>Item1</c> is opened as the .dic stream and <c>Item2</c> as the .aff stream;
 /// <c>Item2</c> is also handed to <c>encoding.getEncoding</c> to pick the text encoding used to
 /// read the word list. A single dictionary was previously checked by assigning
 /// <c>data = file("cs_cz")</c> (or "de", "br_FR") instead of looping — NOTE(review): <c>file</c>
 /// is not visible here, confirm it still exists before relying on it.
 /// </remarks>
 public static void init()
 {
     foreach (var data in files())
     {
         var encod = encoding.getEncoding(data.Item2);

         // Skip the first line (presumably the Hunspell entry-count header), keep only
         // entries that start with a letter, and strip the "/flags" suffix from each entry.
         var lines = File.ReadAllLines(data.Item1, encod)
             .Skip(1)
             .Where(l => !string.IsNullOrEmpty(l) && char.IsLetter(l[0]))
             .Select(l => l.Split('/')[0])
             .ToArray();

         using (var dic = File.OpenRead(data.Item1))
         using (var aff = File.OpenRead(data.Item2))
         {
             try
             {
                 Hunspell.Dictionary dict    = new Hunspell.Dictionary(aff, dic);
                 Hunspell.Stemmer    stemmer = new Hunspell.Stemmer(dict);
                 foreach (var w in lines)
                 {
                     // Only the side effect matters here: a throwing Stem() call marks
                     // the dictionary as broken. The result itself is not used.
                     stemmer.Stem(w);
                 }
             }
             catch (Exception exp)
             {
                 // Best effort: keep going with the next dictionary, but do not lose the cause.
                 // The file name still goes to stdout exactly as before; details go to stderr.
                 Console.WriteLine(data.Item1);
                 Console.Error.WriteLine(exp);
             }
         }
     }
     Console.WriteLine("DONE");
     Console.ReadKey();
 }
Exemplo n.º 2
0
        /// <summary>
        /// Stems <paramref name="s"/> with the shared <c>stemmer</c> and asserts that the
        /// resulting stems, compared as sorted strings, equal <paramref name="expected"/>.
        /// </summary>
        /// <param name="s">Word to stem.</param>
        /// <param name="expected">Expected stems, in any order. The array is sorted in place.</param>
        internal static void AssertStemsTo(string s, params string[] expected)
        {
            assertNotNull(stemmer);
            Array.Sort(expected);

            IList<CharsRef> found = stemmer.Stem(s);

            // Copy every stem out as a plain string so both sides can be sorted and compared.
            string[] actual = new string[found.Count];
            int slot = 0;
            foreach (CharsRef stem in found)
            {
                actual[slot] = stem.ToString();
                slot++;
            }
            Array.Sort(actual);

            // LUCENENET: the failure message is supplied lazily so it is only built when the assertion fails.
            assertArrayEquals(
                () => string.Concat("expected=", Arrays.ToString(expected), ",actual=", Arrays.ToString(actual)),
                expected,
                actual);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Stems <paramref name="s"/> with the shared <c>stemmer</c> and asserts that the
        /// resulting stems, compared as sorted strings, equal <paramref name="expected"/>.
        /// </summary>
        /// <param name="s">Word to stem.</param>
        /// <param name="expected">Expected stems, in any order. The array is sorted in place.</param>
        internal static void AssertStemsTo(string s, params string[] expected)
        {
            assertNotNull(stemmer);
            Array.Sort(expected);

            IList<CharsRef> found = stemmer.Stem(s);

            // Copy every stem out as a plain string so both sides can be sorted and compared.
            string[] actual = new string[found.Count];
            int slot = 0;
            foreach (CharsRef stem in found)
            {
                actual[slot] = stem.ToString();
                slot++;
            }
            Array.Sort(actual);

            // LUCENENET: assertArrayEquals was used here originally, but it failed to compare
            // the arrays properly, so Assert.AreEqual is used instead.
            string message = string.Concat("expected=", Arrays.ToString(expected), ",actual=", Arrays.ToString(actual));
            Assert.AreEqual(expected, actual, message);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Verifies that a word using an affix flag declared after 65536 other affix rules
        /// is still stemmed correctly: "drinks" must stem to exactly one stem, "drink".
        /// </summary>
        public void Test()
        {
            DirectoryInfo tempDir = CreateTempDir("64kaffixes");
            FileInfo      affix   = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff"));
            FileInfo      dict    = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic"));

            // The writers are scoped explicitly so each file is flushed and closed exactly once
            // before it is reopened for reading below. FileMode.Create truncates any stale content.
            using (var affixWriter = new StreamWriter(
                       new FileStream(affix.FullName, FileMode.Create), Encoding.UTF8))
            {
                // 65k affixes with flag 1, then an affix with flag 2
                affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
                for (int i = 0; i < 65536; i++)
                {
                    affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
                }
                affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
            }

            using (var dictWriter = new StreamWriter(
                       new FileStream(dict.FullName, FileMode.Create), Encoding.UTF8))
            {
                // drink signed with affix 2 (takes -s)
                dictWriter.Write("1\ndrink/2\n");
            }

            // Open read-only and require the files to exist: a missing fixture should fail
            // loudly here instead of being silently created as an empty file.
            using Stream affStream  = new FileStream(affix.FullName, FileMode.Open, FileAccess.Read);
            using Stream dictStream = new FileStream(dict.FullName, FileMode.Open, FileAccess.Read);

            Dictionary dictionary = new Dictionary(affStream, dictStream);
            Stemmer    stemmer    = new Stemmer(dictionary);
            // drinks should still stem to drink
            IList <CharsRef> stems = stemmer.Stem("drinks");

            assertEquals(1, stems.Count);
            assertEquals("drink", stems[0].ToString());
        }
Exemplo n.º 5
0
        /// <summary>
        /// Advances to the next output token. Stems still buffered from the previous input
        /// token are emitted first (with a position increment of 0); otherwise the next input
        /// token is read, stemmed, and its first stem replaces the term text.
        /// </summary>
        /// <returns>
        /// <c>false</c> when the wrapped input has no more tokens; otherwise <c>true</c>.
        /// </returns>
        public override bool IncrementToken()
        {
            // Drain stems left over from the previous input token before reading a new one.
            bool hasPending = buffer != null && buffer.Count > 0;
            if (hasPending)
            {
                CharsRef pending = buffer[0];
                buffer.RemoveAt(0);
                RestoreState(savedState);
                posIncAtt.PositionIncrement = 0;
                termAtt.SetEmpty().Append(pending);
                return true;
            }

            if (!m_input.IncrementToken())
            {
                return false;
            }

            // Tokens flagged as keywords are passed through without stemming.
            if (keywordAtt.IsKeyword)
            {
                return true;
            }

            if (dedup)
            {
                buffer = new List<CharsRef>(stemmer.UniqueStems(termAtt.Buffer, termAtt.Length));
            }
            else
            {
                buffer = new List<CharsRef>(stemmer.Stem(termAtt.Buffer, termAtt.Length));
            }

            // we do not know this word, return it unchanged
            if (buffer.Count == 0)
            {
                return true;
            }

            // When only one stem is wanted, order the candidates with lengthComparer so the
            // chosen one ends up at index 0.
            if (longestOnly && buffer.Count > 1)
            {
                buffer.Sort(lengthComparer);
            }

            CharsRef first = buffer[0];
            buffer.RemoveAt(0);
            termAtt.SetEmpty().Append(first);

            if (longestOnly)
            {
                // Remaining candidates are discarded.
                buffer.Clear();
            }
            else if (buffer.Count > 0)
            {
                // More stems will follow on later calls; remember the token state to restore for them.
                savedState = CaptureState();
            }

            return true;
        }