/// <summary>
/// Optimize the given Trie (remove empty rows) and return the resulting
/// Trie.
/// </summary>
/// <param name="orig">the <see cref="Trie"/> to consolidate</param>
/// <returns>the newly consolidated Trie</returns>
public override Trie Optimize(Trie orig)
{
    IList<string> cmds = orig.cmds;
    IList<Row> rows = new List<Row>();
    IList<Row> orows = orig.rows;
    int[] remap = new int[orows.Count];

    for (int j = orows.Count - 1; j >= 0; j--)
    {
        Row now = new Remap(orows[j], remap);
        bool merged = false;

        for (int i = 0; i < rows.Count; i++)
        {
            Row q = Merge(now, rows[i]);
            if (q != null)
            {
                rows[i] = q;
                merged = true;
                remap[j] = i;
                break;
            }
        }

        if (merged == false)
        {
            remap[j] = rows.Count;
            rows.Add(now);
        }
    }

    int root = remap[orig.root];
    Arrays.Fill(remap, -1);
    rows = RemoveGaps(root, rows, new List<Row>(), remap);

    return new Trie(orig.forward, remap[root], cmds, rows);
}
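// Illustrative sketch (not part of the original sources): the Optimize method above is
// normally reached through Trie.Reduce, as the Compile tool and the tests below do.
// The sample words and the resulting patch commands are made up for demonstration only.
internal static void OptimizerUsageSketch()
{
    Trie trie = new Trie(true); // forward trie
    Diff diff = new Diff();

    // Store the patch command that rewrites each inflected form back to its stem.
    trie.Add("interesting", diff.Exec("interesting", "interest"));
    trie.Add("interested", diff.Exec("interested", "interest"));

    // Reduce feeds the row table through Optimize(), merging compatible rows and
    // closing the gaps, while lookups keep returning the same commands.
    Trie reduced = trie.Reduce(new Optimizer());
    reduced.PrintInfo(Console.Out, "optimized: ");
}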
/// <summary>
/// Create a Stemmer using a pre-loaded stemmer table
/// </summary>
/// <param name="stemmer">pre-loaded stemmer table</param>
public StempelStemmer(Trie stemmer)
{
    this.stemmer = stemmer;
}
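// Illustrative sketch (not part of the original sources): constructing a StempelStemmer
// from a serialized table, mirroring how DefaultsHolder below loads DEFAULT_TABLE.
// StempelStemmer.Load(Stream) is taken from that code; the Stem(...) call and the sample
// word are assumptions made for this example only.
internal static void StempelStemmerUsageSketch(System.IO.Stream stemmerTableStream)
{
    Trie stemmerTable = StempelStemmer.Load(stemmerTableStream);
    StempelStemmer stemmer = new StempelStemmer(stemmerTable);

    // Assumed API: Stem(word) applies the table's patch commands to the word and
    // returns the stemmed form (or null when no command applies).
    var stemmed = stemmer.Stem("przykładowego");
    Console.WriteLine(stemmed);
}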
/// <summary>
/// Entry point to the Compile application.
/// <para/>
/// This program takes any number of arguments: the first is the name of the
/// desired stemming algorithm to use (a list is available in the package
/// description), all of the rest should be the path or paths to a file or
/// files containing a stemmer table to compile.
/// </summary>
/// <param name="args">the command line arguments</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        return;
    }

    // LUCENENET NOTE: This line does nothing in .NET
    // and also does nothing in Java...what?
    //args[0].ToUpperInvariant();

    // Reads the first char of the first arg
    backward = args[0][0] == '-';
    int qq = (backward) ? 1 : 0;
    bool storeorig = false;

    if (args[0][qq] == '0')
    {
        storeorig = true;
        qq++;
    }

    multi = args[0][qq] == 'M';
    if (multi)
    {
        qq++;
    }

    string charset = SystemProperties.GetProperty("egothor.stemmer.charset", "UTF-8");
    var stemmerTables = new List<string>();

    // LUCENENET specific
    // command line argument overrides environment variable or default, if supplied
    for (int i = 1; i < args.Length; i++)
    {
        if (("-e".Equals(args[i]) || "--encoding".Equals(args[i])) && i + 1 < args.Length)
        {
            // the charset name is the argument that follows the -e/--encoding flag
            charset = args[++i];
        }
        else
        {
            stemmerTables.Add(args[i]);
        }
    }

    char[] optimizer = new char[args[0].Length - qq];
    for (int i = 0; i < optimizer.Length; i++)
    {
        optimizer[i] = args[0][qq + i];
    }

    foreach (var stemmerTable in stemmerTables)
    {
        // System.out.println("[" + args[i] + "]");
        Diff diff = new Diff();
        //int stems = 0; // not used
        int words = 0;

        AllocTrie();

        Console.WriteLine(stemmerTable);
        using (TextReader input = new StreamReader(
            new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
        {
            string line;
            while ((line = input.ReadLine()) != null)
            {
                try
                {
                    line = line.ToLowerInvariant();
                    StringTokenizer st = new StringTokenizer(line);
                    string stem = st.NextToken();
                    if (storeorig)
                    {
                        trie.Add(stem, "-a");
                        words++;
                    }
                    while (st.HasMoreTokens())
                    {
                        string token = st.NextToken();
                        if (token.Equals(stem) == false)
                        {
                            trie.Add(token, diff.Exec(token, stem));
                            words++;
                        }
                    }
                }
                catch (InvalidOperationException /*x*/)
                {
                    // no base token (stem) on a line
                }
            }
        }

        Optimizer o = new Optimizer();
        Optimizer2 o2 = new Optimizer2();
        Lift l = new Lift(true);
        Lift e = new Lift(false);
        Gener g = new Gener();

        for (int j = 0; j < optimizer.Length; j++)
        {
            string prefix;
            switch (optimizer[j])
            {
                case 'G':
                    trie = trie.Reduce(g);
                    prefix = "G: ";
                    break;
                case 'L':
                    trie = trie.Reduce(l);
                    prefix = "L: ";
                    break;
                case 'E':
                    trie = trie.Reduce(e);
                    prefix = "E: ";
                    break;
                case '2':
                    trie = trie.Reduce(o2);
                    prefix = "2: ";
                    break;
                case '1':
                    trie = trie.Reduce(o);
                    prefix = "1: ";
                    break;
                default:
                    continue;
            }
            trie.PrintInfo(Console.Out, prefix + " ");
        }

        using (DataOutputStream os = new DataOutputStream(
            new FileStream(stemmerTable + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
        {
            os.WriteUTF(args[0]);
            trie.Store(os);
        }
    }
}
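// Illustrative sketch (not part of the original sources): how the first argument to
// Main above is decoded, assuming the enclosing class is Compile as the summary
// suggests. "-0ME2" means: backward trie ('-'), store the original form under the
// "-a" command ('0'), build a MultiTrie2 ('M'), then run the 'E' (Lift) and
// '2' (Optimizer2) reduction passes. The table file names are placeholders.
internal static void CompileUsageSketch()
{
    Compile.Main(new string[] { "-0ME2", "stemmer_table.txt" });

    // An explicit input encoding can be requested with -e/--encoding followed by
    // the charset name; the compiled table is written next to the input file
    // as "stemmer_table.txt.out".
    Compile.Main(new string[] { "-0ME2", "--encoding", "ISO-8859-2", "stemmer_table.txt" });
}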
static DefaultsHolder()
{
    try
    {
        DEFAULT_STOP_SET = WordlistLoader.GetWordSet(
            IOUtils.GetDecodingReader(typeof(PolishAnalyzer),
                typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STOPWORD_FILE, Encoding.UTF8),
            "#",
#pragma warning disable 612, 618
            LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
    }
    catch (IOException ex)
    {
        // default set should always be present as it is part of the
        // distribution (embedded resource)
        throw new SystemException("Unable to load default stopword set", ex);
    }

    try
    {
        DEFAULT_TABLE = StempelStemmer.Load(typeof(PolishAnalyzer).Assembly.GetManifestResourceStream(
            typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STEMMER_FILE));
    }
    catch (IOException ex)
    {
        // default set should always be present as it is part of the
        // distribution (embedded resource)
        throw new SystemException("Unable to load default stemming tables", ex);
    }
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
/// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
/// stemming.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    this.stemTable = DefaultsHolder.DEFAULT_TABLE;
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(
        matchVersion, stemExclusionSet));
}
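// Illustrative sketch (not part of the original sources): analyzing Polish text with the
// constructor above. The field name, sample text, and set contents are placeholders;
// GetTokenStream, AddAttribute and ICharTermAttribute are standard Lucene.NET APIs.
internal static void PolishAnalyzerUsageSketch()
{
#pragma warning disable 612, 618
    LuceneVersion version = LuceneVersion.LUCENE_CURRENT;
#pragma warning restore 612, 618
    CharArraySet stopwords = new CharArraySet(version, new[] { "i", "w" }, true /*ignoreCase*/);
    CharArraySet stemExclusions = new CharArraySet(version, new[] { "lucene" }, true /*ignoreCase*/);

    using (var analyzer = new PolishAnalyzer(version, stopwords, stemExclusions))
    using (TokenStream ts = analyzer.GetTokenStream("body", "przykładowy tekst"))
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            // terms from the stopword set are dropped; terms in the exclusion set
            // are marked as keywords and skipped by the stemming filter
            Console.WriteLine(termAtt.ToString());
        }
        ts.End();
    }
}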
internal static Trie LoadTrie(string path)
{
    Trie trie;
    using (DataInputStream @is = new DataInputStream(
        new FileStream(path, FileMode.Open, FileAccess.Read)))
    {
        string method = @is.ReadUTF().ToUpperInvariant();
        if (method.IndexOf('M') < 0)
        {
            trie = new Trie(@is);
        }
        else
        {
            trie = new MultiTrie(@is);
        }
    }
    return trie;
}
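// Illustrative sketch (not part of the original sources): LoadTrie reads back a table
// produced by the Compile tool above, whose output file stores the method/algorithm
// name (WriteUTF(args[0])) followed by the serialized trie. A method name containing
// 'M' comes back as a MultiTrie, anything else as a plain Trie. The path is a placeholder.
internal static void LoadTrieUsageSketch()
{
    Trie trie = LoadTrie("stemmer_table.txt.out");
    trie.PrintInfo(Console.Out, "loaded: ");
}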
private static void AssertTrie(Trie trie, string file, bool usefull, bool storeorig)
{
    using (TextReader @in = new StreamReader(new FileStream(file, FileMode.Open), Encoding.UTF8))
    {
        for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
        {
            try
            {
                line = line.ToLowerInvariant();
                StringTokenizer st = new StringTokenizer(line);
                string stem = st.NextToken();
                if (storeorig)
                {
                    string cmd = (usefull) ? trie.GetFully(stem) : trie.GetLastOnPath(stem);
                    StringBuilder stm = new StringBuilder(stem);
                    Diff.Apply(stm, cmd);
                    assertEquals(stem.ToLowerInvariant(), stm.ToString().ToLowerInvariant());
                }
                while (st.HasMoreTokens())
                {
                    string token = st.NextToken();
                    if (token.Equals(stem))
                    {
                        continue;
                    }
                    string cmd = (usefull) ? trie.GetFully(token) : trie.GetLastOnPath(token);
                    StringBuilder stm = new StringBuilder(token);
                    Diff.Apply(stm, cmd);
                    assertEquals(stem.ToLowerInvariant(), stm.ToString().ToLowerInvariant());
                }
            }
            catch (InvalidOperationException /*x*/)
            {
                // no base token (stem) on a line
            }
        }
    }
}
internal static void AllocTrie()
{
    if (multi)
    {
        trie = new MultiTrie2(!backward);
    }
    else
    {
        trie = new Trie(!backward);
    }
}
/**
 * Entry point to the Compile application.
 * <p>
 * This program takes any number of arguments: the first is the name of the
 * desired stemming algorithm to use (a list is available in the package
 * description), all of the rest should be the path or paths to a file or
 * files containing a stemmer table to compile.
 *
 * @param args the command line arguments
 */
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        return;
    }

    args[0].ToUpperInvariant();
    backward = args[0][0] == '-';
    int qq = (backward) ? 1 : 0;
    bool storeorig = false;

    if (args[0][qq] == '0')
    {
        storeorig = true;
        qq++;
    }

    multi = args[0][qq] == 'M';
    if (multi)
    {
        qq++;
    }

    // LUCENENET TODO: Is this any different than Encoding.UTF8?
    //String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");

    char[] optimizer = new char[args[0].Length - qq];
    for (int i = 0; i < optimizer.Length; i++)
    {
        optimizer[i] = args[0][qq + i];
    }

    for (int i = 1; i < args.Length; i++)
    {
        TextReader @in;
        // System.out.println("[" + args[i] + "]");
        Diff diff = new Diff();
        //int stems = 0; // not used
        int words = 0;

        AllocTrie();

        Console.WriteLine(args[i]);
        using (@in = new StreamReader(
            new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            for (string line = @in.ReadLine(); line != null; line = @in.ReadLine())
            {
                try
                {
                    line = line.ToLowerInvariant();
                    StringTokenizer st = new StringTokenizer(line);
                    string stem = st.NextToken();
                    if (storeorig)
                    {
                        trie.Add(stem, "-a");
                        words++;
                    }
                    while (st.HasMoreTokens())
                    {
                        string token = st.NextToken();
                        if (token.Equals(stem) == false)
                        {
                            trie.Add(token, diff.Exec(token, stem));
                            words++;
                        }
                    }
                }
                catch (InvalidOperationException /*x*/)
                {
                    // no base token (stem) on a line
                }
            }
        }

        Optimizer o = new Optimizer();
        Optimizer2 o2 = new Optimizer2();
        Lift l = new Lift(true);
        Lift e = new Lift(false);
        Gener g = new Gener();

        for (int j = 0; j < optimizer.Length; j++)
        {
            string prefix;
            switch (optimizer[j])
            {
                case 'G':
                    trie = trie.Reduce(g);
                    prefix = "G: ";
                    break;
                case 'L':
                    trie = trie.Reduce(l);
                    prefix = "L: ";
                    break;
                case 'E':
                    trie = trie.Reduce(e);
                    prefix = "E: ";
                    break;
                case '2':
                    trie = trie.Reduce(o2);
                    prefix = "2: ";
                    break;
                case '1':
                    trie = trie.Reduce(o);
                    prefix = "1: ";
                    break;
                default:
                    continue;
            }
            trie.PrintInfo(System.Console.Out, prefix + " ");
        }

        using (DataOutputStream os = new DataOutputStream(
            new FileStream(args[i] + ".out", FileMode.OpenOrCreate, FileAccess.Write)))
        {
            os.WriteUTF(args[0]);
            trie.Store(os);
        }
    }
}
private static void AssertTrieContents(Trie trie, string[] keys, string[] vals)
{
    Trie[] tries = new Trie[] {
        trie,
        trie.Reduce(new Optimizer()),
        trie.Reduce(new Optimizer2()),
        trie.Reduce(new Gener()),
        trie.Reduce(new Lift(true)),
        trie.Reduce(new Lift(false))
    };

    foreach (Trie t in tries)
    {
        for (int i = 0; i < keys.Length; i++)
        {
            assertEquals(vals[i], t.GetFully(keys[i]).ToString());
            assertEquals(vals[i], t.GetLastOnPath(keys[i]).ToString());
        }
    }
}
public void TestTrieBackwards()
{
    Trie t = new Trie(false);

    string[] keys = { "a", "ba", "bb", "c" };
    string[] vals = { "1", "2", "2", "4" };

    for (int i = 0; i < keys.Length; i++)
    {
        t.Add(keys[i], vals[i]);
    }
    AssertTrieContents(t, keys, vals);
}
public void TestTrie()
{
    Trie t = new Trie(true);

    string[] keys = { "a", "ba", "bb", "c" };
    string[] vals = { "1", "2", "2", "4" };

    for (int i = 0; i < keys.Length; i++)
    {
        t.Add(keys[i], vals[i]);
    }
    assertEquals(0, t.root);
    assertEquals(2, t.rows.Count);
    assertEquals(3, t.cmds.Count);
    AssertTrieContents(t, keys, vals);
}
/// <summary>
/// Return a Trie with infrequent values occurring in the given Trie removed.
/// </summary>
/// <param name="orig">the Trie to optimize</param>
/// <returns>a new optimized Trie</returns>
public override Trie Optimize(Trie orig)
{
    IList<string> cmds = orig.cmds;
    IList<Row> rows = new List<Row>();
    IList<Row> orows = orig.rows;
    int[] remap = new int[orows.Count];

    Arrays.Fill(remap, 1);
    for (int j = orows.Count - 1; j >= 0; j--)
    {
        if (Eat(orows[j], remap))
        {
            remap[j] = 0;
        }
    }

    Arrays.Fill(remap, -1);
    rows = RemoveGaps(orig.root, orows, new List<Row>(), remap);

    return new Trie(orig.forward, remap[orig.root], cmds, rows);
}