Beispiel #1
0
        /// <summary>
        /// Smoke test that lower-cases a Polish sentence (using the "pl" culture), splits it
        /// on runs of whitespace, periods and commas, looks every token up in a
        /// <c>PolishStemmer</c> and prints each returned stem and tag to the console.
        /// The test makes no assertions; it only checks that the lookups complete.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the name suggests this reproduces issue #27 of the upstream
        /// project - confirm against the issue tracker.
        /// </remarks>
        public void Test_Gh27()
        {
            PolishStemmer stemmer = new PolishStemmer();

            string input = "Nie zabrakło oczywiście wpadek. Największym zaskoczeniem okazał się dla nas strój Katarzyny Zielińskiej, której ewidentnie o coś chodziło, ale wciąż nie wiemy o co.";

            foreach (string t in Regex.Split(input.ToLower(new CultureInfo("pl")), "[\\s\\.\\,]+"))
            {
                // Regex.Split returns an empty string when the input starts or ends with a
                // delimiter match. The sentence above ends with ".", so without this guard
                // an empty token would be printed and passed to the stemmer.
                if (t.Length == 0)
                {
                    continue;
                }

                Console.Out.WriteLine("> '" + t + "'");
                foreach (WordData wd in stemmer.Lookup(t))
                {
                    // Fetch the stem once instead of calling GetStem() twice per entry.
                    var stem = wd.GetStem();
                    Console.Out.WriteLine(
                        "  - " +
                        (stem == null ? "<null>" : stem.ToString()) + ", " + wd.GetTag());
                }
                Console.Out.WriteLine();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Looks up the word "liga" and verifies the fields of the returned
        /// <c>WordData</c> entries: the expected stems and tags are present, repeated
        /// reads of an entry return values already seen, and the byte-buffer accessors
        /// (<c>GetStemBytes</c>, <c>GetTagBytes</c>, <c>GetWordBytes</c>) agree with the
        /// character-based accessors when decoded as UTF-8.
        /// </summary>
        public void TestWordDataFields()
        {
            IStemmer s = new PolishStemmer();

            String           word     = "liga";
            IList <WordData> response = s.Lookup(word);

            assertEquals(2, response.Count);

            HashSet <String> stems = new HashSet <String>();
            HashSet <String> tags  = new HashSet <String>();

            foreach (WordData wd in response)
            {
                stems.Add(wd.GetStem().ToString());
                tags.Add(wd.GetTag().ToString());
                assertSame(word, wd.Word.ToString());
            }
            assertTrue(stems.Contains("ligać"));
            assertTrue(stems.Contains("liga"));
            assertTrue(tags.Contains("subst:sg:nom:f"));
            assertTrue(tags.Contains("verb:fin:sg:ter:imperf:nonrefl+verb:fin:sg:ter:imperf:refl.nonrefl"));

            // Repeat to make sure we get the same values consistently. The results of
            // Contains must be asserted; discarding them would make this loop a no-op.
            foreach (WordData wd in response)
            {
                assertTrue(stems.Contains(wd.GetStem().ToString()));
                assertTrue(tags.Contains(wd.GetTag().ToString()));
            }

            Encoding encoding = Encoding.UTF8;

            // Run the same consistency check for the returned buffers.
            ByteBuffer temp = ByteBuffer.Allocate(100);

            foreach (WordData wd in response)
            {
                // Buffer should be copied.
                ByteBuffer copy = wd.GetStemBytes(null);
                String     stem = encoding.GetString(copy.Array, copy.ArrayOffset + copy.Position, copy.Remaining);
                // The buffer should be present in stems set.
                assertTrue(stem, stems.Contains(stem));
                // Buffer large enough to hold the contents. Reset it first, as the tag
                // loop below does, so state left by the previous iteration cannot
                // influence the call.
                temp.Clear();
                assertSame(temp, wd.GetStemBytes(temp));
                // The copy and the clone should be identical.
                assertEquals(0, copy.CompareTo(temp));
            }

            foreach (WordData wd in response)
            {
                // Buffer should be copied.
                ByteBuffer copy = wd.GetTagBytes(null);
                String     tag  = encoding.GetString(copy.Array, copy.ArrayOffset + copy.Position, copy.Remaining);
                // The buffer should be present in tags set.
                assertTrue(tag, tags.Contains(tag));
                // Buffer large enough to hold the contents.
                temp.Clear();
                assertSame(temp, wd.GetTagBytes(temp));
                // The copy and the clone should be identical.
                assertEquals(0, copy.CompareTo(temp));
            }

            foreach (WordData wd in response)
            {
                // Buffer should be copied.
                ByteBuffer copy = wd.GetWordBytes(null);
                assertNotNull(copy);
                // The word bytes must equal the UTF-8 encoding of the looked-up word.
                assertEquals(0, copy.CompareTo(ByteBuffer.Wrap(encoding.GetBytes(word))));
            }
        }