// ********************** STEMMER, not used in favour of SqlServer
// Uses the Hunspell stemmer. It returns either the base word from the .DIC file or the word itself.
// For Czech, it does not work for some verbs.

/// <summary>
/// Smoke-tests the Hunspell stemmer: for every dictionary/affix pair returned by <c>files()</c>,
/// loads the dictionary and stems each of its head words. A failure while building the
/// dictionary or stemming is reported on the console and the run continues with the next pair.
/// Prints "DONE" and waits for a key press when finished.
/// </summary>
public static void init()
{
    // To check a single dictionary, the loop source was previously swapped for one of:
    //   file("cs_cz"), file("de"), file("br_FR")
    foreach (var data in files())
    {
        // Item1 is opened below as the .dic stream and Item2 as the .aff stream.
        // NOTE(review): encoding.getEncoding presumably derives the text encoding from the affix file - confirm.
        var encod = encoding.getEncoding(data.Item2);

        // Skip the first line of the .dic file, keep only entries that start with a letter,
        // and strip the "/flags" suffix from each entry.
        var words = File.ReadAllLines(data.Item1, encod)
            .Skip(1)
            .Where(l => !string.IsNullOrEmpty(l) && char.IsLetter(l[0]))
            .Select(l => l.Split('/')[0])
            .ToArray();

        using (var dic = File.OpenRead(data.Item1))
        using (var aff = File.OpenRead(data.Item2))
        {
            try
            {
                Hunspell.Dictionary dict = new Hunspell.Dictionary(aff, dic);
                Hunspell.Stemmer stemmer = new Hunspell.Stemmer(dict);
                foreach (var word in words)
                {
                    // Only checking that stemming succeeds; the stems themselves are not used.
                    stemmer.Stem(word);
                }
            }
            catch (Exception exp)
            {
                // Best effort: report which dictionary failed and why, then carry on with the next one.
                Console.WriteLine(data.Item1 + ": " + exp.Message);
            }
        }
    }

    Console.WriteLine("DONE");
    Console.ReadKey();
}
/// <summary>
/// Builds a <c>Dictionary</c> from the given affix resource and dictionary resources and
/// stores a new <c>Stemmer</c> over it in the static <c>stemmer</c> field.
/// Resources are resolved through <c>typeof(StemmerTestBase).getResourceAsStream</c>.
/// </summary>
/// <param name="ignoreCase">Passed through to the <c>Dictionary</c> constructor.</param>
/// <param name="affix">Resource name of the affix file.</param>
/// <param name="dictionaries">Resource names of one or more dictionary files.</param>
/// <exception cref="System.ArgumentException">No dictionary name was supplied.</exception>
/// <exception cref="FileNotFoundException">The affix resource or one of the dictionary resources could not be resolved.</exception>
internal static void Init(bool ignoreCase, string affix, params string[] dictionaries)
{
    if (dictionaries.Length == 0)
    {
        throw new System.ArgumentException("there must be at least one dictionary");
    }

    System.IO.Stream affixStream = typeof(StemmerTestBase).getResourceAsStream(affix);
    if (affixStream == null)
    {
        throw new FileNotFoundException("file not found: " + affix);
    }

    System.IO.Stream[] dictStreams = new System.IO.Stream[dictionaries.Length];
    try
    {
        // Open the dictionaries inside the try so that a missing resource does not leak
        // the affix stream or the dictionary streams that were already opened.
        for (int i = 0; i < dictionaries.Length; i++)
        {
            dictStreams[i] = typeof(StemmerTestBase).getResourceAsStream(dictionaries[i]);
            if (dictStreams[i] == null)
            {
                // Report the resource name; the stream itself is null at this point.
                throw new FileNotFoundException("file not found: " + dictionaries[i]);
            }
        }

        Dictionary dictionary = new Dictionary(affixStream, Arrays.AsList(dictStreams), ignoreCase);
        stemmer = new Stemmer(dictionary);
    }
    finally
    {
        // NOTE(review): on the failure path dictStreams may still contain null entries;
        // this relies on IOUtils.CloseWhileHandlingException skipping nulls - confirm.
        IOUtils.CloseWhileHandlingException(affixStream);
        IOUtils.CloseWhileHandlingException(null, dictStreams);
    }
}
/// <summary>
/// Builds a HunspellStemFilter that stems the tokens of the supplied <see cref="TokenStream"/>
/// by applying the affix rules of the supplied Dictionary.
/// </summary>
/// <param name="input"> the <see cref="TokenStream"/> providing the tokens to stem </param>
/// <param name="dictionary"> the Hunspell <see cref="Dictionary"/> holding the affix rules and words used for stemming </param>
/// <param name="dedup"> whether duplicates should be removed </param>
/// <param name="longestOnly"> true when only the longest term should be output </param>
public HunspellStemFilter(TokenStream input, Dictionary dictionary, bool dedup, bool longestOnly)
    : base(input)
{
    stemmer = new Stemmer(dictionary);
    this.longestOnly = longestOnly;

    // Deduplication is skipped when longestOnly is set, so no time is wasted on it.
    this.dedup = !longestOnly && dedup;

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    keywordAtt = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Writes an affix file declaring 65536 suffix rules under flag 1 plus a single "-s" suffix
/// under flag 2, and a one-word dictionary ("drink" with flag 2), then checks that
/// "drinks" still stems to exactly one result, "drink".
/// </summary>
public void Test()
{
    DirectoryInfo tempDir = CreateTempDir("64kaffixes");
    FileInfo affix = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff"));
    FileInfo dict = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic"));

    // Scoped using blocks: each writer is flushed and its file handle released exactly once,
    // before the file is re-opened for reading below. FileMode.Create truncates any stale content.
    using (var affixWriter = new StreamWriter(
        new FileStream(affix.FullName, FileMode.Create), Encoding.UTF8))
    {
        // 65k affixes with flag 1, then an affix with flag 2
        affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
        for (int i = 0; i < 65536; i++)
        {
            affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
        }
        affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
    }

    using (var dictWriter = new StreamWriter(
        new FileStream(dict.FullName, FileMode.Create), Encoding.UTF8))
    {
        // drink signed with affix 2 (takes -s)
        dictWriter.Write("1\ndrink/2\n");
    }

    // Open read-only and require the files to exist, rather than silently creating empty ones.
    using Stream affStream = new FileStream(affix.FullName, FileMode.Open, FileAccess.Read);
    using Stream dictStream = new FileStream(dict.FullName, FileMode.Open, FileAccess.Read);

    Dictionary dictionary = new Dictionary(affStream, dictStream);
    Stemmer stemmer = new Stemmer(dictionary);

    // drinks should still stem to drink
    IList<CharsRef> stems = stemmer.Stem("drinks");
    assertEquals(1, stems.Count);
    assertEquals("drink", stems[0].ToString());
}