The Diff object generates a patch string.

A patch string is actually a command to a stemmer telling it how to reduce a word to its root. For example, to reduce the word "teacher" to its root "teach", the patch string "Db" would be generated. This command tells the stemmer to delete the last 2 characters from the word "teacher" to reach the stem (the patch commands are applied starting from the last character in order to save ... [the text is truncated at this point in the original]).

Esempio n. 1
0
        /// <summary>
        /// Entry point to the DiffIt application.
        /// <para>
        /// The first argument is handed to <c>Get</c> with the indexes 0 through 3 to
        /// obtain the four values (<c>ins</c>, <c>del</c>, <c>rep</c>, <c>nop</c>) that are
        /// passed to the <see cref="Diff"/> constructor. Every following argument is the
        /// path to a file containing a stemmer table.
        /// </para>
        /// <para>
        /// Each file is read line by line as UTF-8 and lower-cased. The first token of a
        /// line is the stem; for every line the program prints <c>stem -a</c> and, for each
        /// further token that differs from the stem, the stem followed by the patch
        /// command returned by <c>Diff.Exec(token, stem)</c>.
        /// </para>
        /// </summary>
        /// <param name="args">
        /// the Diff settings string followed by the path(s) to the stemmer table file(s);
        /// when empty the method returns without doing anything
        /// </param>
        public static void Main(string[] args)
        {
            // Nothing to do without the settings argument (mirrors Compile.Main).
            if (args.Length < 1)
            {
                return;
            }

            string settings = args[0];
            int ins = Get(0, settings);
            int del = Get(1, settings);
            int rep = Get(2, settings);
            int nop = Get(3, settings);

            for (int i = 1; i < args.Length; i++)
            {
                Diff diff = new Diff(ins, del, rep, nop);

                // LUCENENET TODO: Is using Encoding.UTF8 good enough?
                // Disposing the reader also closes the underlying FileStream, so the
                // file handle is released even if processing a line throws.
                using (TextReader reader = new StreamReader(
                    new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        try
                        {
                            StringTokenizer st = new StringTokenizer(line.ToLowerInvariant());
                            string stem = st.NextToken();
                            Console.WriteLine(stem + " -a");
                            while (st.HasMoreTokens())
                            {
                                string token = st.NextToken();
                                if (!token.Equals(stem, StringComparison.Ordinal))
                                {
                                    Console.WriteLine(stem + " " + diff.Exec(token, stem));
                                }
                            }
                        }
                        catch (InvalidOperationException)
                        {
                            // no base token (stem) on a line
                        }
                    }
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Entry point to the Compile application.
        /// <para>
        /// This program takes any number of arguments: the first is the name of the
        /// desired stemming algorithm to use (a list is available in the package
        /// description), all of the rest should be the path or paths to a file or
        /// files containing a stemmer table to compile.
        /// </para>
        /// <para>
        /// The first argument is read left to right: an optional leading <c>'-'</c> sets
        /// <c>backward</c>, an optional <c>'0'</c> makes each stem itself be added to the trie
        /// with the <c>"-a"</c> command, an optional <c>'M'</c> sets <c>multi</c>, and every
        /// remaining character selects a reduction step (<c>'G'</c>, <c>'L'</c>, <c>'E'</c>,
        /// <c>'2'</c>, <c>'1'</c>; any other character is skipped). Matching is
        /// case-sensitive. For each table the resulting trie is written to a file named
        /// after the table with <c>".out"</c> appended.
        /// </para>
        /// </summary>
        /// <param name="args">the command line arguments</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                return;
            }

            // The original called args[0].ToUpperInvariant() here and discarded the
            // result; the call had no effect and has been dropped.
            string algorithm = args[0];
            int qq = 0;

            // Every flag test is bounds-checked so that a short argument such as "-"
            // or "0" no longer throws IndexOutOfRangeException.
            backward = qq < algorithm.Length && algorithm[qq] == '-';
            if (backward)
            {
                qq++;
            }

            bool storeorig = qq < algorithm.Length && algorithm[qq] == '0';
            if (storeorig)
            {
                qq++;
            }

            multi = qq < algorithm.Length && algorithm[qq] == 'M';
            if (multi)
            {
                qq++;
            }

            // LUCENENET TODO: Is this any different than Encoding.UTF8?

            // Whatever follows the flags is the ordered list of reduction steps.
            char[] optimizer = algorithm.Substring(qq).ToCharArray();

            for (int i = 1; i < args.Length; i++)
            {
                Diff diff = new Diff();

                AllocTrie();

                Console.WriteLine(args[i]);
                using (TextReader reader = new StreamReader(
                    new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        try
                        {
                            StringTokenizer st = new StringTokenizer(line.ToLowerInvariant());
                            string stem = st.NextToken();
                            if (storeorig)
                            {
                                trie.Add(stem, "-a");
                            }
                            while (st.HasMoreTokens())
                            {
                                string token = st.NextToken();
                                if (!token.Equals(stem, StringComparison.Ordinal))
                                {
                                    trie.Add(token, diff.Exec(token, stem));
                                }
                            }
                        }
                        catch (InvalidOperationException)
                        {
                            // no base token (stem) on a line
                        }
                    }
                }

                Optimizer o = new Optimizer();
                Optimizer2 o2 = new Optimizer2();
                Lift l = new Lift(true);
                Lift e = new Lift(false);
                Gener g = new Gener();

                foreach (char step in optimizer)
                {
                    string prefix;
                    switch (step)
                    {
                        case 'G':
                            trie = trie.Reduce(g);
                            prefix = "G: ";
                            break;
                        case 'L':
                            trie = trie.Reduce(l);
                            prefix = "L: ";
                            break;
                        case 'E':
                            trie = trie.Reduce(e);
                            prefix = "E: ";
                            break;
                        case '2':
                            trie = trie.Reduce(o2);
                            prefix = "2: ";
                            break;
                        case '1':
                            trie = trie.Reduce(o);
                            prefix = "1: ";
                            break;
                        default:
                            // unknown step character: skip it without printing anything
                            continue;
                    }
                    trie.PrintInfo(System.Console.Out, prefix + " ");
                }

                // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would
                // leave stale trailing bytes behind when the new output is shorter than
                // the previous one.
                using (DataOutputStream os = new DataOutputStream(
                    new FileStream(args[i] + ".out", FileMode.Create, FileAccess.Write)))
                {
                    os.WriteUTF(args[0]);
                    trie.Store(os);
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Entry point to the Compile application.
        /// <para/>
        /// This program takes any number of arguments: the first is the name of the
        /// desired stemming algorithm to use (a list is available in the package
        /// description), all of the rest should be the path or paths to a file or
        /// files containing a stemmer table to compile.
        /// <para/>
        /// The first argument is read left to right: an optional leading <c>'-'</c> sets
        /// <c>backward</c>, an optional <c>'0'</c> makes each stem itself be added to the trie
        /// with the <c>"-a"</c> command, an optional <c>'M'</c> sets <c>multi</c>, and every
        /// remaining character selects a reduction step (<c>'G'</c>, <c>'L'</c>, <c>'E'</c>,
        /// <c>'2'</c>, <c>'1'</c>; any other character is skipped). Matching is
        /// case-sensitive.
        /// <para/>
        /// Among the remaining arguments, <c>-e NAME</c> or <c>--encoding NAME</c> selects the
        /// encoding used to read the stemmer tables; otherwise the value of the
        /// <c>egothor:stemmer:encoding</c> system property is used, falling back to UTF-8.
        /// For each table the resulting trie is written to a file named after the table
        /// with <c>".out"</c> appended.
        /// </summary>
        /// <param name="args">the command line arguments</param>
        /// <exception cref="ArgumentException">
        /// <c>-e</c> / <c>--encoding</c> is the last argument and has no encoding name after it.
        /// </exception>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                return;
            }

            // LUCENENET NOTE: the upstream code upper-cases args[0] here but discards the
            // result, so the flags below are matched exactly as typed.
            string algorithm = args[0];
            int qq = 0;

            // Every flag test is bounds-checked so that a short argument such as "-"
            // or "0" no longer throws IndexOutOfRangeException.
            backward = qq < algorithm.Length && algorithm[qq] == '-';
            if (backward)
            {
                qq++;
            }

            bool storeorig = qq < algorithm.Length && algorithm[qq] == '0';
            if (storeorig)
            {
                qq++;
            }

            multi = qq < algorithm.Length && algorithm[qq] == 'M';
            if (multi)
            {
                qq++;
            }

            // LUCENENET specific - reformatted with : and changed "charset" to "encoding"
            string charset = SystemProperties.GetProperty("egothor:stemmer:encoding", "UTF-8");
            var stemmerTables = new JCG.List<string>();

            // LUCENENET specific
            // command line argument overrides environment variable or default, if supplied
            for (int i = 1; i < args.Length; i++)
            {
                string arg = args[i];
                if ("-e".Equals(arg, StringComparison.Ordinal) || "--encoding".Equals(arg, StringComparison.Ordinal))
                {
                    // The encoding name is the argument AFTER the switch. The previous
                    // code stored the switch itself, so the encoding lookup failed and
                    // the real encoding name was treated as a stemmer table path.
                    if (i + 1 >= args.Length)
                    {
                        throw new ArgumentException("Missing encoding name after '" + arg + "'.", nameof(args));
                    }
                    charset = args[++i];
                }
                else
                {
                    stemmerTables.Add(arg);
                }
            }

            // Whatever follows the flags is the ordered list of reduction steps.
            char[] optimizer = algorithm.Substring(qq).ToCharArray();

            foreach (var stemmerTable in stemmerTables)
            {
                Diff diff = new Diff();

                AllocTrie();

                Console.WriteLine(stemmerTable);
                using (TextReader input = new StreamReader(
                           new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        using StringTokenizer st = new StringTokenizer(line.ToLowerInvariant());

                        // LUCENENET: st.MoveNext() will return false rather than throwing a NoSuchElementException
                        if (!st.MoveNext())
                        {
                            // no base token (stem) on a line
                            continue;
                        }

                        string stem = st.Current;
                        if (storeorig)
                        {
                            trie.Add(stem, "-a");
                        }
                        while (st.MoveNext())
                        {
                            string token = st.Current;
                            if (!token.Equals(stem, StringComparison.Ordinal))
                            {
                                trie.Add(token, diff.Exec(token, stem));
                            }
                        }
                    }
                }

                Optimizer o = new Optimizer();
                Optimizer2 o2 = new Optimizer2();
                Lift l = new Lift(true);
                Lift e = new Lift(false);
                Gener g = new Gener();

                foreach (char step in optimizer)
                {
                    string prefix;
                    switch (step)
                    {
                        case 'G':
                            trie = trie.Reduce(g);
                            prefix = "G: ";
                            break;

                        case 'L':
                            trie = trie.Reduce(l);
                            prefix = "L: ";
                            break;

                        case 'E':
                            trie = trie.Reduce(e);
                            prefix = "E: ";
                            break;

                        case '2':
                            trie = trie.Reduce(o2);
                            prefix = "2: ";
                            break;

                        case '1':
                            trie = trie.Reduce(o);
                            prefix = "1: ";
                            break;

                        default:
                            // unknown step character: skip it without printing anything
                            continue;
                    }
                    trie.PrintInfo(Console.Out, prefix + " ");
                }

                // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would
                // leave stale trailing bytes behind when the new output is shorter than
                // the previous one.
                using DataOutputStream os = new DataOutputStream(
                          new FileStream(stemmerTable + ".out", FileMode.Create, FileAccess.Write));
                os.WriteUTF(args[0]);
                trie.Store(os);
            }
        }