Lift's raison d'être is to implement reduction of the trie via the Lift-Up method, which makes the data structure less liable to overstemming.
/// <summary>
/// Entry point to the Compile application.
/// <para/>
/// This program takes any number of arguments: the first is the name of the
/// desired stemming algorithm to use (a list is available in the package
/// description), all of the rest should be the path or paths to a file or
/// files containing a stemmer table to compile.
/// <para/>
/// The first argument is parsed character by character, in this order:
/// an optional leading <c>-</c> sets <c>backward</c>; an optional <c>0</c>
/// makes every stem get stored with the <c>"-a"</c> command; an optional
/// <c>M</c> sets <c>multi</c>; every remaining character selects one trie
/// reduction step (<c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c>), applied
/// in the order given. Unrecognized step characters are skipped.
/// <para/>
/// For each table file <c>path</c>, the result is written to <c>path + ".out"</c>:
/// first the unmodified first argument, then the stored trie.
/// </summary>
/// <param name="args">the command line arguments</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        return;
    }

    // NOTE: the previous code called args[0].ToUpperInvariant() and discarded the
    // result. Strings are immutable, so that statement had no effect; it has been
    // dropped, which keeps the option parsing case-sensitive exactly as before.
    string options = args[0];

    // Every index below is bounds-checked so that short option strings such as
    // "", "-", "0" or "-0M" yield an empty reduction list instead of throwing
    // IndexOutOfRangeException.
    backward = options.Length > 0 && options[0] == '-';
    int qq = backward ? 1 : 0;

    bool storeorig = false;
    if (qq < options.Length && options[qq] == '0')
    {
        storeorig = true;
        qq++;
    }

    multi = qq < options.Length && options[qq] == 'M';
    if (multi)
    {
        qq++;
    }

    // Whatever follows the flags is the ordered list of reduction steps.
    string optimizer = options.Substring(qq);

    for (int i = 1; i < args.Length; i++)
    {
        Diff diff = new Diff();

        AllocTrie();

        Console.WriteLine(args[i]);

        // LUCENENET TODO: the Java version read the charset from the
        // "egothor.stemmer.charset" system property (default "UTF-8");
        // this version always reads the table as UTF-8.
        using (TextReader reader = new StreamReader(
            new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                try
                {
                    line = line.ToLowerInvariant();
                    StringTokenizer st = new StringTokenizer(line);

                    // The first token on a line is the stem; the rest are its word forms.
                    string stem = st.NextToken();
                    if (storeorig)
                    {
                        trie.Add(stem, "-a");
                    }

                    while (st.HasMoreTokens())
                    {
                        string token = st.NextToken();
                        if (!token.Equals(stem, StringComparison.Ordinal))
                        {
                            trie.Add(token, diff.Exec(token, stem));
                        }
                    }
                }
                catch (InvalidOperationException)
                {
                    // no base token (stem) on a line
                }
            }
        }

        Optimizer o = new Optimizer();
        Optimizer2 o2 = new Optimizer2();
        Lift l = new Lift(true);
        Lift e = new Lift(false);
        Gener g = new Gener();

        foreach (char step in optimizer)
        {
            string prefix;
            switch (step)
            {
                case 'G':
                    trie = trie.Reduce(g);
                    prefix = "G: ";
                    break;
                case 'L':
                    trie = trie.Reduce(l);
                    prefix = "L: ";
                    break;
                case 'E':
                    trie = trie.Reduce(e);
                    prefix = "E: ";
                    break;
                case '2':
                    trie = trie.Reduce(o2);
                    prefix = "2: ";
                    break;
                case '1':
                    trie = trie.Reduce(o);
                    prefix = "1: ";
                    break;
                default:
                    // Unknown step character: skip it without printing anything.
                    continue;
            }
            trie.PrintInfo(Console.Out, prefix + " ");
        }

        // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would
        // leave stale bytes at the end when the new output is shorter than the old.
        using (DataOutputStream os = new DataOutputStream(
            new FileStream(args[i] + ".out", FileMode.Create, FileAccess.Write)))
        {
            os.WriteUTF(args[0]);
            trie.Store(os);
        }
    }
}
/// <summary>
/// Entry point to the Compile application.
/// <para/>
/// This program takes any number of arguments: the first is the name of the
/// desired stemming algorithm to use (a list is available in the package
/// description), all of the rest should be the path or paths to a file or
/// files containing a stemmer table to compile.
/// <para/>
/// The first argument is parsed character by character, in this order:
/// an optional leading <c>-</c> sets <c>backward</c>; an optional <c>0</c>
/// makes every stem get stored with the <c>"-a"</c> command; an optional
/// <c>M</c> sets <c>multi</c>; every remaining character selects one trie
/// reduction step (<c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c>), applied
/// in the order given. Unrecognized step characters are skipped.
/// <para/>
/// Among the remaining arguments, <c>-e NAME</c> or <c>--encoding NAME</c> sets
/// the encoding used to read the stemmer tables, overriding the
/// <c>egothor:stemmer:charset</c> property (default <c>UTF-8</c>).
/// <para/>
/// For each table file <c>path</c>, the result is written to <c>path + ".out"</c>:
/// first the unmodified first argument, then the stored trie.
/// </summary>
/// <param name="args">the command line arguments</param>
/// <exception cref="ArgumentException">
/// <c>-e</c> / <c>--encoding</c> is the last argument and has no encoding name after it.
/// </exception>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        return;
    }

    // LUCENENET NOTE: the Java code upper-cased args[0] here but discarded the
    // result, so option parsing is (and stays) case-sensitive.
    string options = args[0];

    // Every index below is bounds-checked so that short option strings such as
    // "", "-", "0" or "-0M" yield an empty reduction list instead of throwing
    // IndexOutOfRangeException.
    backward = options.Length > 0 && options[0] == '-';
    int qq = backward ? 1 : 0;

    bool storeorig = false;
    if (qq < options.Length && options[qq] == '0')
    {
        storeorig = true;
        qq++;
    }

    multi = qq < options.Length && options[qq] == 'M';
    if (multi)
    {
        qq++;
    }

    // Whatever follows the flags is the ordered list of reduction steps.
    string optimizer = options.Substring(qq);

    // LUCENENET specific - reformatted with :
    string charset = SystemProperties.GetProperty("egothor:stemmer:charset", "UTF-8");
    var stemmerTables = new List<string>();

    // LUCENENET specific
    // command line argument overrides environment variable or default, if supplied
    for (int i = 1; i < args.Length; i++)
    {
        bool isEncodingFlag =
            "-e".Equals(args[i], StringComparison.Ordinal) ||
            "--encoding".Equals(args[i], StringComparison.Ordinal);

        if (!isEncodingFlag)
        {
            stemmerTables.Add(args[i]);
            continue;
        }

        // The encoding name is the argument FOLLOWING the flag. The previous code
        // stored the flag text itself as the charset, which made
        // Encoding.GetEncoding fail and queued the real encoding name as a table path.
        if (i + 1 >= args.Length)
        {
            throw new ArgumentException(
                "Missing encoding name after '" + args[i] + "'.", "args");
        }
        charset = args[++i];
    }

    foreach (var stemmerTable in stemmerTables)
    {
        Diff diff = new Diff();

        AllocTrie();

        Console.WriteLine(stemmerTable);

        using (TextReader input = new StreamReader(
            new FileStream(stemmerTable, FileMode.Open, FileAccess.Read),
            Encoding.GetEncoding(charset)))
        {
            string line;
            while ((line = input.ReadLine()) != null)
            {
                try
                {
                    line = line.ToLowerInvariant();
                    StringTokenizer st = new StringTokenizer(line);

                    // The first token on a line is the stem; the rest are its word forms.
                    st.MoveNext();
                    string stem = st.Current;
                    if (storeorig)
                    {
                        trie.Add(stem, "-a");
                    }

                    while (st.MoveNext())
                    {
                        string token = st.Current;
                        if (!token.Equals(stem, StringComparison.Ordinal))
                        {
                            trie.Add(token, diff.Exec(token, stem));
                        }
                    }
                }
                catch (InvalidOperationException)
                {
                    // no base token (stem) on a line
                }
            }
        }

        Optimizer o = new Optimizer();
        Optimizer2 o2 = new Optimizer2();
        Lift l = new Lift(true);
        Lift e = new Lift(false);
        Gener g = new Gener();

        foreach (char step in optimizer)
        {
            string prefix;
            switch (step)
            {
                case 'G':
                    trie = trie.Reduce(g);
                    prefix = "G: ";
                    break;
                case 'L':
                    trie = trie.Reduce(l);
                    prefix = "L: ";
                    break;
                case 'E':
                    trie = trie.Reduce(e);
                    prefix = "E: ";
                    break;
                case '2':
                    trie = trie.Reduce(o2);
                    prefix = "2: ";
                    break;
                case '1':
                    trie = trie.Reduce(o);
                    prefix = "1: ";
                    break;
                default:
                    // Unknown step character: skip it without printing anything.
                    continue;
            }
            trie.PrintInfo(Console.Out, prefix + " ");
        }

        // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would
        // leave stale bytes at the end when the new output is shorter than the old.
        using (DataOutputStream os = new DataOutputStream(
            new FileStream(stemmerTable + ".out", FileMode.Create, FileAccess.Write)))
        {
            os.WriteUTF(args[0]);
            trie.Store(os);
        }
    }
}