Пример #1
0
        /// <summary>
        /// Builds a consolidated copy of the given <see cref="Trie"/>. Rows are
        /// visited from last to first and each one is folded into the first
        /// already-kept row that <c>Merge</c> accepts; rows nothing accepts are
        /// kept as they are. <c>RemoveGaps</c> then rebuilds the row list starting
        /// from the remapped root.
        /// </summary>
        /// <param name="orig">the <see cref="Trie"/> to consolidate</param>
        /// <returns>the newly consolidated Trie</returns>
        public override Trie Optimize(Trie orig)
        {
            IList<string> commands = orig.cmds;
            IList<Row> sourceRows = orig.rows;
            IList<Row> keptRows = new List<Row>();

            // remap[j] records the index in keptRows that original row j ended up in.
            int[] remap = new int[sourceRows.Count];

            for (int j = sourceRows.Count - 1; j >= 0; j--)
            {
                Row candidate = new Remap(sourceRows[j], remap);
                int target = -1;

                // Offer the candidate to every kept row in order and stop at the
                // first one that Merge accepts (signalled by a non-null result).
                for (int i = 0; i < keptRows.Count && target < 0; i++)
                {
                    Row combined = Merge(candidate, keptRows[i]);
                    if (combined != null)
                    {
                        keptRows[i] = combined;
                        target = i;
                    }
                }

                if (target < 0)
                {
                    // No kept row accepted it, so the candidate becomes a new kept row.
                    target = keptRows.Count;
                    keptRows.Add(candidate);
                }

                remap[j] = target;
            }

            // Remember where the root went before remap is reset and reused by RemoveGaps.
            int root = remap[orig.root];
            Arrays.Fill(remap, -1);
            keptRows = RemoveGaps(root, keptRows, new List<Row>(), remap);

            return new Trie(orig.forward, remap[root], commands, keptRows);
        }
Пример #2
0
 /// <summary>
 /// Creates a stemmer backed by the supplied, already loaded stemmer table.
 /// </summary>
 /// <param name="stemmer">the pre-loaded stemmer table this instance will keep a reference to</param>
 public StempelStemmer(Trie stemmer) => this.stemmer = stemmer;
Пример #3
0
        /// <summary>
        /// Entry point to the Compile application.
        /// <para/>
        /// The first argument describes the stemming algorithm: an optional leading
        /// <c>-</c> (sets <c>backward</c>), an optional <c>0</c> (every stem is also
        /// stored with the <c>-a</c> command), an optional <c>M</c> (sets
        /// <c>multi</c>), followed by the reduction passes to run in order
        /// (<c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c>; any other character is
        /// skipped).
        /// <para/>
        /// Every remaining argument is the path of a stemmer table to compile,
        /// except <c>-e</c>/<c>--encoding</c>, which consumes the argument that
        /// follows it as the name of the charset used to read the tables. The
        /// default charset comes from the <c>egothor.stemmer.charset</c> property
        /// (falling back to UTF-8). The compiled form of <c>path</c> is written to
        /// <c>path.out</c>.
        /// </summary>
        /// <param name="args">the command line arguments</param>
        /// <exception cref="ArgumentException">
        /// <c>-e</c>/<c>--encoding</c> is the last argument and therefore has no value.
        /// </exception>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                return;
            }

            string algorithm = args[0];
            int qq = 0;

            // Every flag is optional, so bounds-check before each read: a short
            // first argument such as "" or "-" must not index past its end.
            backward = qq < algorithm.Length && algorithm[qq] == '-';
            if (backward)
            {
                qq++;
            }

            bool storeorig = qq < algorithm.Length && algorithm[qq] == '0';
            if (storeorig)
            {
                qq++;
            }

            multi = qq < algorithm.Length && algorithm[qq] == 'M';
            if (multi)
            {
                qq++;
            }

            // Whatever follows the flags is the ordered list of reduction passes.
            string optimizer = algorithm.Substring(qq);

            string charset = SystemProperties.GetProperty("egothor.stemmer.charset", "UTF-8");
            var stemmerTables = new List<string>();

            // LUCENENET specific
            // command line argument overrides environment variable or default, if supplied
            for (int i = 1; i < args.Length; i++)
            {
                if ("-e".Equals(args[i]) || "--encoding".Equals(args[i]))
                {
                    if (i + 1 >= args.Length)
                    {
                        throw new ArgumentException("Missing charset name after '" + args[i] + "'.");
                    }

                    // The value is the NEXT argument; consume it so that it is not
                    // mistaken for a stemmer table path.
                    charset = args[++i];
                }
                else
                {
                    stemmerTables.Add(args[i]);
                }
            }

            foreach (var stemmerTable in stemmerTables)
            {
                Diff diff = new Diff();

                AllocTrie();

                Console.WriteLine(stemmerTable);
                using (TextReader input = new StreamReader(
                           new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset)))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        try
                        {
                            line = line.ToLowerInvariant();
                            StringTokenizer st = new StringTokenizer(line);

                            // The first token on a line is the stem; the rest are its variants.
                            string stem = st.NextToken();
                            if (storeorig)
                            {
                                trie.Add(stem, "-a");
                            }
                            while (st.HasMoreTokens())
                            {
                                string token = st.NextToken();
                                if (!token.Equals(stem))
                                {
                                    trie.Add(token, diff.Exec(token, stem));
                                }
                            }
                        }
                        catch (InvalidOperationException /*x*/)
                        {
                            // no base token (stem) on a line
                        }
                    }
                }

                Optimizer  o  = new Optimizer();
                Optimizer2 o2 = new Optimizer2();
                Lift       l  = new Lift(true);
                Lift       e  = new Lift(false);
                Gener      g  = new Gener();

                foreach (char pass in optimizer)
                {
                    string prefix;
                    switch (pass)
                    {
                    case 'G':
                        trie   = trie.Reduce(g);
                        prefix = "G: ";
                        break;

                    case 'L':
                        trie   = trie.Reduce(l);
                        prefix = "L: ";
                        break;

                    case 'E':
                        trie   = trie.Reduce(e);
                        prefix = "E: ";
                        break;

                    case '2':
                        trie   = trie.Reduce(o2);
                        prefix = "2: ";
                        break;

                    case '1':
                        trie   = trie.Reduce(o);
                        prefix = "1: ";
                        break;

                    default:
                        // Unknown pass characters are skipped without any output.
                        continue;
                    }
                    trie.PrintInfo(Console.Out, prefix + " ");
                }

                // FileMode.Create truncates an existing file; OpenOrCreate would leave
                // stale trailing bytes behind when the new table is shorter.
                using (DataOutputStream os = new DataOutputStream(
                           new FileStream(stemmerTable + ".out", FileMode.Create, FileAccess.Write)))
                {
                    os.WriteUTF(args[0]);
                    trie.Store(os);
                }
            }
        }
Пример #4
0
            /// <summary>
            /// Loads the default stop word set and the default stemmer table from
            /// resources named after the namespace of <see cref="PolishAnalyzer"/>.
            /// An <see cref="IOException"/> from either load is rethrown wrapped in a
            /// <see cref="SystemException"/>.
            /// </summary>
            static DefaultsHolder()
            {
                // Resource names are built as "<namespace of PolishAnalyzer>.<file name>".
                string resourcePrefix = typeof(PolishAnalyzer).Namespace + ".";

                try
                {
                    var stopwordReader = IOUtils.GetDecodingReader(
                        typeof(PolishAnalyzer), resourcePrefix + DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                    DEFAULT_STOP_SET = WordlistLoader.GetWordSet(stopwordReader, "#", LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    throw new SystemException("Unable to load default stopword set", ex);
                }

                try
                {
                    var tableStream = typeof(PolishAnalyzer).Assembly.GetManifestResourceStream(
                        resourcePrefix + DEFAULT_STEMMER_FILE);
                    DEFAULT_TABLE = StempelStemmer.Load(tableStream);
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    throw new SystemException("Unable to load default stemming tables", ex);
                }
            }
Пример #5
0
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
 /// stemming. The analyzer uses the default stemmer table and keeps its own
 /// unmodifiable copy of the exclusion set.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
 public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     this.stemTable = DefaultsHolder.DEFAULT_TABLE;

     // Copy first, then wrap the copy as unmodifiable before storing it.
     var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Пример #6
0
 /// <summary>
 /// Reads a stored trie from the file at <paramref name="path"/>. The file starts
 /// with a UTF string; when its upper-cased form contains <c>M</c> the rest of
 /// the stream is read as a <see cref="MultiTrie"/>, otherwise as a plain
 /// <see cref="Trie"/>.
 /// </summary>
 /// <param name="path">path of the file to read</param>
 /// <returns>the trie constructed from the file contents</returns>
 internal static Trie LoadTrie(string path)
 {
     using (DataInputStream input = new DataInputStream(
         new FileStream(path, FileMode.Open, FileAccess.Read)))
     {
         string header = input.ReadUTF().ToUpperInvariant();
         bool isMulti = header.IndexOf('M') >= 0;

         if (isMulti)
         {
             return new MultiTrie(input);
         }

         return new Trie(input);
     }
 }
Пример #7
0
        /// <summary>
        /// Checks a trie against a UTF-8 stemmer table file. Each line is lower-cased
        /// and split into tokens; the first token is the stem and the remaining ones
        /// are variants. For every variant that differs from the stem (and for the
        /// stem itself when <paramref name="storeorig"/> is set), the command looked
        /// up in the trie is applied to the word and the result must equal the stem.
        /// Lines without a stem token are skipped.
        /// </summary>
        /// <param name="trie">the trie under test</param>
        /// <param name="file">path of the stemmer table file</param>
        /// <param name="usefull">
        /// <c>true</c> to look commands up with <c>GetFully</c>, <c>false</c> to use
        /// <c>GetLastOnPath</c>
        /// </param>
        /// <param name="storeorig">whether the stem itself is also expected in the trie</param>
        private static void AssertTrie(Trie trie, string file, bool usefull,
            bool storeorig)
        {
            // The file is only read, so request read access explicitly; the
            // two-argument FileStream constructor would ask for read/write access.
            using (TextReader reader =
                new StreamReader(new FileStream(file, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    try
                    {
                        StringTokenizer st = new StringTokenizer(line.ToLowerInvariant());
                        string stem = st.NextToken();
                        if (storeorig)
                        {
                            AssertStemsTo(trie, stem, stem, usefull);
                        }
                        while (st.HasMoreTokens())
                        {
                            string token = st.NextToken();
                            if (!token.Equals(stem))
                            {
                                AssertStemsTo(trie, token, stem, usefull);
                            }
                        }
                    }
                    catch (InvalidOperationException /*x*/)
                    {
                        // no base token (stem) on a line
                    }
                }
            }
        }

        /// <summary>
        /// Looks up the command for <paramref name="word"/>, applies it to the word
        /// and asserts that the result equals <paramref name="expectedStem"/>
        /// (both compared in lower case).
        /// </summary>
        private static void AssertStemsTo(Trie trie, string word, string expectedStem, bool usefull)
        {
            string cmd = usefull ? trie.GetFully(word) : trie.GetLastOnPath(word);
            StringBuilder result = new StringBuilder(word);
            Diff.Apply(result, cmd);
            assertEquals(expectedStem.ToLowerInvariant(), result.ToString().ToLowerInvariant());
        }
Пример #8
0
 /// <summary>
 /// Replaces the shared <c>trie</c> with a fresh, empty instance: a
 /// <see cref="MultiTrie2"/> when <c>multi</c> is set, otherwise a plain
 /// <see cref="Trie"/>. The constructor argument is the negation of <c>backward</c>.
 /// </summary>
 internal static void AllocTrie()
 {
     bool forward = !backward;

     if (!multi)
     {
         trie = new Trie(forward);
         return;
     }

     trie = new MultiTrie2(forward);
 }
Пример #9
0
        /// <summary>
        /// Entry point to the Compile application.
        /// <para/>
        /// The first argument describes the stemming algorithm: an optional leading
        /// <c>-</c> (sets <c>backward</c>), an optional <c>0</c> (every stem is also
        /// stored with the <c>-a</c> command), an optional <c>M</c> (sets
        /// <c>multi</c>), followed by the reduction passes to run in order
        /// (<c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c>; any other character is
        /// skipped).
        /// <para/>
        /// Every remaining argument is the path of a UTF-8 stemmer table to compile.
        /// The compiled form of <c>path</c> is written to <c>path.out</c>.
        /// </summary>
        /// <param name="args">the command line arguments</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                return;
            }

            // NOTE: the original called args[0].ToUpperInvariant() here and discarded
            // the result. Strings are immutable, so that call had no effect and has
            // been dropped; the first argument is still interpreted case-sensitively.
            string algorithm = args[0];
            int qq = 0;

            // Every flag is optional, so bounds-check before each read: a short
            // first argument such as "" or "-" must not index past its end.
            backward = qq < algorithm.Length && algorithm[qq] == '-';
            if (backward)
            {
                qq++;
            }

            bool storeorig = qq < algorithm.Length && algorithm[qq] == '0';
            if (storeorig)
            {
                qq++;
            }

            multi = qq < algorithm.Length && algorithm[qq] == 'M';
            if (multi)
            {
                qq++;
            }

            // The Java original read the charset from the "egothor.stemmer.charset"
            // system property (default "UTF-8"); this port always reads UTF-8.

            // Whatever follows the flags is the ordered list of reduction passes.
            string optimizer = algorithm.Substring(qq);

            for (int i = 1; i < args.Length; i++)
            {
                string tablePath = args[i];
                Diff diff = new Diff();

                AllocTrie();

                Console.WriteLine(tablePath);
                using (TextReader input = new StreamReader(
                    new FileStream(tablePath, FileMode.Open, FileAccess.Read), Encoding.UTF8))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        try
                        {
                            line = line.ToLowerInvariant();
                            StringTokenizer st = new StringTokenizer(line);

                            // The first token on a line is the stem; the rest are its variants.
                            string stem = st.NextToken();
                            if (storeorig)
                            {
                                trie.Add(stem, "-a");
                            }
                            while (st.HasMoreTokens())
                            {
                                string token = st.NextToken();
                                if (!token.Equals(stem))
                                {
                                    trie.Add(token, diff.Exec(token, stem));
                                }
                            }
                        }
                        catch (InvalidOperationException /*x*/)
                        {
                            // no base token (stem) on a line
                        }
                    }
                }

                Optimizer o = new Optimizer();
                Optimizer2 o2 = new Optimizer2();
                Lift l = new Lift(true);
                Lift e = new Lift(false);
                Gener g = new Gener();

                foreach (char pass in optimizer)
                {
                    string prefix;
                    switch (pass)
                    {
                        case 'G':
                            trie = trie.Reduce(g);
                            prefix = "G: ";
                            break;
                        case 'L':
                            trie = trie.Reduce(l);
                            prefix = "L: ";
                            break;
                        case 'E':
                            trie = trie.Reduce(e);
                            prefix = "E: ";
                            break;
                        case '2':
                            trie = trie.Reduce(o2);
                            prefix = "2: ";
                            break;
                        case '1':
                            trie = trie.Reduce(o);
                            prefix = "1: ";
                            break;
                        default:
                            // Unknown pass characters are skipped without any output.
                            continue;
                    }
                    trie.PrintInfo(Console.Out, prefix + " ");
                }

                // FileMode.Create truncates an existing file; OpenOrCreate would leave
                // stale trailing bytes behind when the new table is shorter.
                using (DataOutputStream os = new DataOutputStream(
                    new FileStream(tablePath + ".out", FileMode.Create, FileAccess.Write)))
                {
                    os.WriteUTF(args[0]);
                    trie.Store(os);
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Asserts that the trie, and each variant obtained by reducing it with
        /// <see cref="Optimizer"/>, <see cref="Optimizer2"/>, <see cref="Gener"/> and
        /// both <see cref="Lift"/> configurations, returns <c>vals[i]</c> for
        /// <c>keys[i]</c> from both <c>GetFully</c> and <c>GetLastOnPath</c>.
        /// </summary>
        /// <param name="trie">the trie to check</param>
        /// <param name="keys">the keys to look up</param>
        /// <param name="vals">the expected value for the key at the same index</param>
        private static void AssertTrieContents(Trie trie, string[] keys, string[] vals)
        {
            // All reductions are computed up front, before any assertion runs.
            Trie[] variants =
            {
                trie,
                trie.Reduce(new Optimizer()),
                trie.Reduce(new Optimizer2()),
                trie.Reduce(new Gener()),
                trie.Reduce(new Lift(true)),
                trie.Reduce(new Lift(false))
            };

            for (int v = 0; v < variants.Length; v++)
            {
                Trie current = variants[v];
                for (int k = 0; k < keys.Length; k++)
                {
                    string key = keys[k];
                    string expected = vals[k];
                    assertEquals(expected, current.GetFully(key).ToString());
                    assertEquals(expected, current.GetLastOnPath(key).ToString());
                }
            }
        }
Пример #11
0
        public void TestTrieBackwards()
        {
            // Same key/value pairs as TestTrie, but the trie is constructed with
            // 'false'; only the lookups are verified here.
            string[] keys = { "a", "ba", "bb", "c" };
            string[] vals = { "1", "2", "2", "4" };

            Trie t = new Trie(false);

            int position = 0;
            foreach (string key in keys)
            {
                t.Add(key, vals[position]);
                position++;
            }

            AssertTrieContents(t, keys, vals);
        }
Пример #12
0
        public void TestTrie()
        {
            // Fills a trie constructed with 'true', checks its internal layout
            // (root index, row count, command count) and then verifies the lookups.
            string[] keys = { "a", "ba", "bb", "c" };
            string[] vals = { "1", "2", "2", "4" };

            Trie t = new Trie(true);

            int position = 0;
            foreach (string key in keys)
            {
                t.Add(key, vals[position]);
                position++;
            }

            assertEquals(0, t.root);
            assertEquals(2, t.rows.Count);
            assertEquals(3, t.cmds.Count);
            AssertTrieContents(t, keys, vals);
        }
Пример #13
0
        /// <summary>
        /// Return a Trie with infrequent values occurring in the given Trie removed.
        /// <para/>
        /// Every row is first marked with 1; rows are then passed to <c>Eat</c> from
        /// last to first and a row is marked 0 when <c>Eat</c> returns <c>true</c>.
        /// Finally the markers are reset and <c>RemoveGaps</c> rebuilds the row list
        /// starting from the original root.
        /// </summary>
        /// <param name="orig">the Trie to optimize</param>
        /// <returns>a new optimized Trie</returns>
        public override Trie Optimize(Trie orig)
        {
            IList<string> commands = orig.cmds;
            IList<Row> sourceRows = orig.rows;
            int rowCount = sourceRows.Count;

            int[] remap = new int[rowCount];
            Arrays.Fill(remap, 1);

            // Walk the rows from the highest index down to 0.
            for (int index = rowCount; index-- > 0;)
            {
                bool eaten = Eat(sourceRows[index], remap);
                if (eaten)
                {
                    remap[index] = 0;
                }
            }

            // remap is reused: RemoveGaps receives it reset to -1 everywhere, and
            // afterwards it is read to find the new index of the root.
            Arrays.Fill(remap, -1);
            IList<Row> compacted = RemoveGaps(orig.root, sourceRows, new List<Row>(), remap);

            return new Trie(orig.forward, remap[orig.root], commands, compacted);
        }
Пример #14
0
        /// <summary>
        /// Entry point to the Compile application.
        /// <para/>
        /// The first argument describes the stemming algorithm: an optional leading
        /// <c>-</c> (sets <c>backward</c>), an optional <c>0</c> (every stem is also
        /// stored with the <c>-a</c> command), an optional <c>M</c> (sets
        /// <c>multi</c>), followed by the reduction passes to run in order
        /// (<c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c>; any other character is
        /// skipped).
        /// <para/>
        /// Every remaining argument is the path of a UTF-8 stemmer table to compile.
        /// The compiled form of <c>path</c> is written to <c>path.out</c>.
        /// </summary>
        /// <param name="args">the command line arguments</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                return;
            }

            // NOTE: the original called args[0].ToUpperInvariant() here and discarded
            // the result. Strings are immutable, so that call had no effect and has
            // been dropped; the first argument is still interpreted case-sensitively.
            string algorithm = args[0];
            int qq = 0;

            // Every flag is optional, so bounds-check before each read: a short
            // first argument such as "" or "-" must not index past its end.
            backward = qq < algorithm.Length && algorithm[qq] == '-';
            if (backward)
            {
                qq++;
            }

            bool storeorig = qq < algorithm.Length && algorithm[qq] == '0';
            if (storeorig)
            {
                qq++;
            }

            multi = qq < algorithm.Length && algorithm[qq] == 'M';
            if (multi)
            {
                qq++;
            }

            // The Java original read the charset from the "egothor.stemmer.charset"
            // system property (default "UTF-8"); this port always reads UTF-8.

            // Whatever follows the flags is the ordered list of reduction passes.
            string optimizer = algorithm.Substring(qq);

            for (int i = 1; i < args.Length; i++)
            {
                string tablePath = args[i];
                Diff diff = new Diff();

                AllocTrie();

                Console.WriteLine(tablePath);
                using (TextReader input = new StreamReader(
                           new FileStream(tablePath, FileMode.Open, FileAccess.Read), Encoding.UTF8))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        try
                        {
                            line = line.ToLowerInvariant();
                            StringTokenizer st = new StringTokenizer(line);

                            // The first token on a line is the stem; the rest are its variants.
                            string stem = st.NextToken();
                            if (storeorig)
                            {
                                trie.Add(stem, "-a");
                            }
                            while (st.HasMoreTokens())
                            {
                                string token = st.NextToken();
                                if (!token.Equals(stem))
                                {
                                    trie.Add(token, diff.Exec(token, stem));
                                }
                            }
                        }
                        catch (InvalidOperationException /*x*/)
                        {
                            // no base token (stem) on a line
                        }
                    }
                }

                Optimizer  o  = new Optimizer();
                Optimizer2 o2 = new Optimizer2();
                Lift       l  = new Lift(true);
                Lift       e  = new Lift(false);
                Gener      g  = new Gener();

                foreach (char pass in optimizer)
                {
                    string prefix;
                    switch (pass)
                    {
                    case 'G':
                        trie   = trie.Reduce(g);
                        prefix = "G: ";
                        break;

                    case 'L':
                        trie   = trie.Reduce(l);
                        prefix = "L: ";
                        break;

                    case 'E':
                        trie   = trie.Reduce(e);
                        prefix = "E: ";
                        break;

                    case '2':
                        trie   = trie.Reduce(o2);
                        prefix = "2: ";
                        break;

                    case '1':
                        trie   = trie.Reduce(o);
                        prefix = "1: ";
                        break;

                    default:
                        // Unknown pass characters are skipped without any output.
                        continue;
                    }
                    trie.PrintInfo(Console.Out, prefix + " ");
                }

                // FileMode.Create truncates an existing file; OpenOrCreate would leave
                // stale trailing bytes behind when the new table is shorter.
                using (DataOutputStream os = new DataOutputStream(
                           new FileStream(tablePath + ".out", FileMode.Create, FileAccess.Write)))
                {
                    os.WriteUTF(args[0]);
                    trie.Store(os);
                }
            }
        }