/// <summary>
/// Generates unknown-word nodes for the text at the start of <paramref name="suffix"/>
/// and adds them to <paramref name="lattice"/>.
/// </summary>
/// <param name="category">Character category id used to look up the definition and the candidate word ids.</param>
/// <param name="i">Slot in each following character's category array that is compared against <paramref name="category"/>.</param>
/// <param name="lattice">Lattice that receives one node per candidate word id.</param>
/// <param name="unknownWordEndIndex">Value returned unchanged when no node is generated.</param>
/// <param name="startIndex">Start position recorded on each node and used to derive the lattice positions.</param>
/// <param name="suffix">Text whose leading characters form the unknown word.</param>
/// <param name="found">When false, nodes are generated even if the definition's INVOKE entry is not 1.</param>
/// <returns>
/// <paramref name="startIndex"/> plus the unknown word length when nodes were generated;
/// otherwise <paramref name="unknownWordEndIndex"/>.
/// </returns>
private int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, String suffix, bool found)
        {
            int[] charDef = characterDefinitions.LookupDefinition(category);

            // Nothing is generated when INVOKE is not 1 and found is true;
            // the caller's end index is passed straight through.
            bool generate = charDef[CharacterDefinitions.INVOKE] == 1 || !found;
            if (!generate)
            {
                return unknownWordEndIndex;
            }

            // The unknown word always covers at least the first character of suffix.
            int length = 1;

            if (charDef[CharacterDefinitions.GROUP] != 0)
            {
                // Grouping: keep extending while the next character carries the
                // same category in slot i of its category array.
                while (length < suffix.Length)
                {
                    int[] nextCategories = characterDefinitions.LookupCategories(suffix[length]);

                    bool sameCategory = nextCategories != null
                                        && i < nextCategories.Length
                                        && category == nextCategories[i];
                    if (!sameCategory)
                    {
                        break;
                    }

                    length++;
                }
            }

            string surface      = suffix.Substring(0, length);
            int[]  candidateIds = unknownDictionary.LookupWordIds(category); // characters in input text are supposed to be the same

            // One lattice node per candidate word id, all sharing the same surface and span.
            for (int k = 0; k < candidateIds.Length; k++)
            {
                ViterbiNode node = new ViterbiNode(candidateIds[k], surface, unknownDictionary, startIndex, ViterbiNode.NodeType.UNKNOWN);
                lattice.AddNode(node, startIndex + 1, startIndex + 1 + length);
            }

            return startIndex + length;
        }
Esempio n. 2
0
        /// <summary>
        /// Builds unknown-word nodes from the leading characters of <paramref name="suffix"/>
        /// and registers them in <paramref name="lattice"/>.
        /// </summary>
        /// <param name="category">Character category id used for the definition and word id lookups.</param>
        /// <param name="i">Index into each following character's category array that must equal <paramref name="category"/> for the word to grow.</param>
        /// <param name="lattice">Lattice to which the generated nodes are added.</param>
        /// <param name="unknownWordEndIndex">Returned as-is when no node is generated.</param>
        /// <param name="startIndex">Start position stored on each node; lattice positions are derived from it.</param>
        /// <param name="suffix">Text whose prefix becomes the unknown word.</param>
        /// <param name="found">When false, generation happens regardless of the definition's Invoke entry.</param>
        /// <returns>
        /// <paramref name="startIndex"/> plus the generated word length, or
        /// <paramref name="unknownWordEndIndex"/> when nothing was generated.
        /// </returns>
        int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
        {
            var definition = CharacterDefinitions.LookupDefinition(category);
            var invoked    = definition[CharacterDefinitions.Invoke] == 1 || !found;

            // A generated word is at least one character long; zero means "not generated".
            var length = invoked ? 1 : 0;

            if (invoked && definition[CharacterDefinitions.Group] != 0)
            {
                // Grouping: absorb following characters for as long as slot i of
                // their category array matches the requested category.
                for (; length < suffix.Length; length++)
                {
                    var following = CharacterDefinitions.LookupCategories(suffix[length]);

                    if (following == null || i >= following.Length || category != following[i])
                    {
                        break;
                    }
                }
            }

            if (length == 0)
            {
                return unknownWordEndIndex;
            }

            var surface = suffix.Substring(0, length);
            var wordIds = UnknownDictionary.LookupWordIds(category); // characters in input text are supposed to be the same

            // Every candidate word id yields its own node over the same span.
            foreach (var id in wordIds)
            {
                lattice.AddNode(
                    new ViterbiNode(id, surface, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown),
                    startIndex + 1,
                    startIndex + 1 + length);
            }

            return startIndex + length;
        }
        /// <summary>
        /// Verifies the unknown-word dictionary data reachable from the character '一':
        /// its character categories, the word ids registered per category, and the
        /// features and costs stored for those word ids.
        /// </summary>
        public void TestCostsAndFeatures()
        {
            var categories = CharacterDefinitions.LookupCategories('一');

            // KANJI & KANJINUMERIC
            categories.Length.Is(2);
            categories.SequenceEqual(new[] { 5, 6 }).IsTrue();

            // KANJI entries
            UnknownDictionary.LookupWordIds(categories[0]).SequenceEqual(new[] { 2, 3, 4, 5, 6, 7 }).IsTrue();

            // KANJI feature variety (each word id is checked exactly once)
            UnknownDictionary.GetAllFeaturesArray(2).SequenceEqual(new[] { "名詞", "一般", "*", "*", "*", "*", "*" }).IsTrue();
            UnknownDictionary.GetAllFeaturesArray(3).SequenceEqual(new[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" }).IsTrue();
            UnknownDictionary.GetAllFeaturesArray(4).SequenceEqual(new[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" }).IsTrue();
            UnknownDictionary.GetAllFeaturesArray(5).SequenceEqual(new[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" }).IsTrue();
            UnknownDictionary.GetAllFeaturesArray(6).SequenceEqual(new[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" }).IsTrue();

            // NOTE(review): word id 7 is listed among the KANJI entries but its features
            // are not asserted here; add a check once the expected values are confirmed.

            // KANJINUMERIC entry
            UnknownDictionary.LookupWordIds(categories[1]).SequenceEqual(new[] { 29 }).IsTrue();

            // KANJINUMERIC costs
            UnknownDictionary.GetLeftId(29).Is(1295);
            UnknownDictionary.GetRightId(29).Is(1295);
            UnknownDictionary.GetWordCost(29).Is(27473);

            // KANJINUMERIC features
            UnknownDictionary.GetAllFeaturesArray(29).SequenceEqual(new[] { "名詞", "数", "*", "*", "*", "*", "*" }).IsTrue();
        }
        /// <summary>
        /// Verifies the unknown-word dictionary data reachable from the character '一':
        /// its character categories, the word ids registered per category, and the
        /// features and costs stored for those word ids.
        /// </summary>
        public void TestCostsAndFeatures()
        {
            int[] categories = characterDefinitions.LookupCategories('一');

            // KANJI & KANJINUMERIC
            Assert.AreEqual(2, categories.Length);

            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(new int[] { 5, 6 }, categories));

            // KANJI entries
            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new int[] { 2, 3, 4, 5, 6, 7 },
                              unknownDictionary.LookupWordIds(categories[0])
                              ));

            // KANJI feature variety (each word id is checked exactly once)
            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "一般", "*", "*", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(2)
                              ));

            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(3)
                              ));

            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(4)
                              ));

            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(5)
                              ));

            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(6)
                              ));

            // NOTE(review): word id 7 is listed among the KANJI entries but its features
            // are not asserted here; add a check once the expected values are confirmed.

            // KANJINUMERIC entry
            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new int[] { 29 },
                              unknownDictionary.LookupWordIds(categories[1])
                              ));

            // KANJINUMERIC costs
            Assert.AreEqual(1295, unknownDictionary.GetLeftId(29));
            Assert.AreEqual(1295, unknownDictionary.GetRightId(29));
            Assert.AreEqual(27473, unknownDictionary.GetWordCost(29));

            // KANJINUMERIC features
            Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(
                              new string[] { "名詞", "数", "*", "*", "*", "*", "*" },
                              unknownDictionary.GetAllFeaturesArray(29)
                              ));
        }