Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It conforms to the original Hunspell algorithm, including recursive suffix stripping.
コード例 #1
0
ファイル: dicts.cs プロジェクト: reactxx/rewise
 // ********************** STEMMER, not used in favour of SqlServer
 // Uses the Hunspell stemmer. For a word it returns either the base word from the .DIC file or the word itself.
 // For Czech, it does not work for some verbs.
 /// <summary>
 /// Smoke-tests the Hunspell stemmer against every dictionary returned by <c>files()</c>.
 /// For each dictionary, every entry word of the .dic file is passed to the stemmer; the stems
 /// themselves are not used. When loading or stemming throws, the .dic path is printed to
 /// standard output and the exception to standard error, then processing continues with the
 /// next dictionary. Prints "DONE" and waits for a key press at the end.
 /// </summary>
 public static void init()
 {
     foreach (var data in files())
     {
         // data.Item1 is the .dic path, data.Item2 the .aff path (see how they are opened below).
         // NOTE(review): getEncoding presumably derives the text encoding from the .aff file - confirm.
         var encod = encoding.getEncoding(data.Item2);

         // Skip the first line of the .dic file, keep only lines that start with a letter,
         // and cut everything from the first '/' on (the part after the word itself).
         var lines = File.ReadAllLines(data.Item1, encod)
             .Skip(1)
             .Where(l => !string.IsNullOrEmpty(l) && char.IsLetter(l[0]))
             .Select(l => l.Split('/')[0])
             .ToArray();

         using (var dic = File.OpenRead(data.Item1))
         using (var aff = File.OpenRead(data.Item2))
         {
             try
             {
                 Hunspell.Dictionary dict = new Hunspell.Dictionary(aff, dic);
                 Hunspell.Stemmer stemmer = new Hunspell.Stemmer(dict);
                 foreach (var w in lines)
                 {
                     // The result is intentionally discarded: this loop only checks that
                     // every dictionary word can be stemmed without throwing.
                     stemmer.Stem(w);
                 }
             }
             catch (Exception exp)
             {
                 // Keep going with the remaining dictionaries, but do not lose the failure reason.
                 Console.WriteLine(data.Item1);
                 Console.Error.WriteLine(exp);
             }
         }
     }
     Console.WriteLine("DONE");
     Console.ReadKey();
 }
コード例 #2
0
        /// <summary>
        /// Loads the affix file and one or more dictionary files from the resources of
        /// <c>StemmerTestBase</c>, builds a <c>Dictionary</c> from them and stores a new
        /// <c>Stemmer</c> for that dictionary in the <c>stemmer</c> field.
        /// </summary>
        /// <param name="ignoreCase"> passed through to the <c>Dictionary</c> constructor </param>
        /// <param name="affix"> resource name of the affix file </param>
        /// <param name="dictionaries"> resource names of the dictionary files; at least one is required </param>
        /// <exception cref="System.ArgumentException"> when no dictionary name is supplied </exception>
        /// <exception cref="FileNotFoundException"> when the affix or a dictionary resource cannot be found </exception>
        internal static void Init(bool ignoreCase, string affix, params string[] dictionaries)
        {
            if (dictionaries.Length == 0)
            {
                throw new System.ArgumentException("there must be at least one dictionary");
            }

            System.IO.Stream affixStream = null;
            System.IO.Stream[] dictStreams = new System.IO.Stream[dictionaries.Length];

            // Every stream is opened inside the try so that a resource missing part-way through
            // does not leak the streams that were already opened.
            try
            {
                affixStream = typeof(StemmerTestBase).getResourceAsStream(affix);
                if (affixStream == null)
                {
                    throw new FileNotFoundException("file not found: " + affix);
                }

                for (int i = 0; i < dictionaries.Length; i++)
                {
                    dictStreams[i] = typeof(StemmerTestBase).getResourceAsStream(dictionaries[i]);
                    if (dictStreams[i] == null)
                    {
                        // Report the resource name; the stream itself is null here.
                        throw new FileNotFoundException("file not found: " + dictionaries[i]);
                    }
                }

                Dictionary dictionary = new Dictionary(affixStream, Arrays.AsList(dictStreams), ignoreCase);
                stemmer = new Stemmer(dictionary);
            }
            finally
            {
                // NOTE(review): relies on CloseWhileHandlingException skipping null entries,
                // as the Lucene IOUtils contract states - confirm for this port.
                IOUtils.CloseWhileHandlingException(affixStream);
                IOUtils.CloseWhileHandlingException(null, dictStreams);
            }
        }
コード例 #3
0
 /// <summary>
 /// Builds a HunspellStemFilter over the given <see cref="TokenStream"/>; its tokens are stemmed with the
 /// affix rules of the supplied Dictionary.
 /// </summary>
 /// <param name="input"> the <see cref="TokenStream"/> providing the tokens to stem </param>
 /// <param name="dictionary"> Hunspell <see cref="Dictionary"/> holding the affix rules and words used for stemming </param>
 /// <param name="dedup"> whether duplicates should be removed; has no effect when <paramref name="longestOnly"/> is true </param>
 /// <param name="longestOnly"> true to output only the longest term </param>
 public HunspellStemFilter(TokenStream input, Dictionary dictionary, bool dedup, bool longestOnly)
     : base(input)
 {
     this.longestOnly = longestOnly;

     // Deduplication would be wasted work when only the longest term is output,
     // so it is switched off in that case.
     this.dedup = dedup && !longestOnly;

     this.stemmer = new Stemmer(dictionary);

     // Attribute registration order is kept exactly as it was.
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     keywordAtt = AddAttribute<IKeywordAttribute>();
 }
コード例 #4
0
ファイル: Test64kAffixes.cs プロジェクト: ywscr/lucenenet
        /// <summary>
        /// Writes an affix file containing 65536 suffix rules under flag 1 followed by a single
        /// rule under flag 2, plus a one-word dictionary using flag 2, and checks that
        /// "drinks" still stems to exactly one stem, "drink".
        /// </summary>
        public void Test()
        {
            DirectoryInfo tempDir = CreateTempDir("64kaffixes");
            FileInfo affix = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff"));
            FileInfo dict = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic"));

            // The using blocks flush and close each file before it is read back below.
            // FileMode.Create guarantees the file starts empty.
            using (var affixWriter = new StreamWriter(
                       new FileStream(affix.FullName, FileMode.Create), Encoding.UTF8))
            {
                // 65k affixes with flag 1, then an affix with flag 2
                affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
                for (int i = 0; i < 65536; i++)
                {
                    affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
                }
                affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
            }

            using (var dictWriter = new StreamWriter(
                       new FileStream(dict.FullName, FileMode.Create), Encoding.UTF8))
            {
                // drink signed with affix 2 (takes -s)
                dictWriter.Write("1\ndrink/2\n");
            }

            // Open read-only and require the files to exist, so a failed write surfaces here
            // instead of being masked by a freshly created empty file.
            using Stream affStream = new FileStream(affix.FullName, FileMode.Open, FileAccess.Read);
            using Stream dictStream = new FileStream(dict.FullName, FileMode.Open, FileAccess.Read);

            Dictionary dictionary = new Dictionary(affStream, dictStream);
            Stemmer stemmer = new Stemmer(dictionary);

            // drinks should still stem to drink
            IList<CharsRef> stems = stemmer.Stem("drinks");

            assertEquals(1, stems.Count);
            assertEquals("drink", stems[0].ToString());
        }