/// <summary>
/// Generates unknown-word candidate nodes for the text starting at <paramref name="startIndex"/>
/// and adds them to the lattice.
/// </summary>
/// <param name="category">Character category id used to look up the category definition and the unknown-word ids.</param>
/// <param name="i">Position inside each following character's category array at which <paramref name="category"/> must appear for that character to extend the word.</param>
/// <param name="lattice">Lattice that receives one node per unknown-word id.</param>
/// <param name="unknownWordEndIndex">Value returned unchanged when no unknown word is generated.</param>
/// <param name="startIndex">Start index stored in each node; lattice positions are this value plus one.</param>
/// <param name="suffix">Text whose leading characters form the unknown word.</param>
/// <param name="found">When false, generation happens regardless of the category's INVOKE flag (presumably "a known dictionary entry was found" - confirm against the caller).</param>
/// <returns><c>startIndex</c> plus the unknown word length when nodes were added; otherwise <paramref name="unknownWordEndIndex"/>.</returns>
private int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, String suffix, bool found)
{
    int[] definition = characterDefinitions.LookupDefinition(category);

    // Nothing to generate when the category is not flagged INVOKE and "found" is true.
    bool invoked = definition[CharacterDefinitions.INVOKE] == 1;
    if (!invoked && found)
    {
        return unknownWordEndIndex;
    }

    // The first character always belongs to the word. When GROUP is set, keep
    // absorbing following characters while they carry the same category at position i.
    int length = 1;
    if (definition[CharacterDefinitions.GROUP] != 0)
    {
        while (length < suffix.Length)
        {
            int[] candidates = characterDefinitions.LookupCategories(suffix[length]);
            bool sameCategory = candidates != null && i < candidates.Length && candidates[i] == category;
            if (!sameCategory)
            {
                break;
            }
            length++;
        }
    }

    string unknownWord = suffix.Substring(0, length);

    // All characters of the word are assumed to share this category, so one lookup is enough.
    int[] wordIds = unknownDictionary.LookupWordIds(category);
    for (int k = 0; k < wordIds.Length; k++)
    {
        ViterbiNode node = new ViterbiNode(wordIds[k], unknownWord, unknownDictionary, startIndex, ViterbiNode.NodeType.UNKNOWN);
        lattice.AddNode(node, startIndex + 1, startIndex + 1 + length);
    }

    return startIndex + length;
}
/// <summary>
/// Builds unknown-word nodes for the text beginning at <paramref name="startIndex"/> and
/// registers them in the lattice.
/// </summary>
/// <param name="category">Character category id used for the definition lookup and the unknown-word id lookup.</param>
/// <param name="i">Index into each subsequent character's category list where <paramref name="category"/> must be present for the word to keep growing.</param>
/// <param name="lattice">Lattice to which the generated nodes are added.</param>
/// <param name="unknownWordEndIndex">Returned as-is when no unknown word is produced.</param>
/// <param name="startIndex">Start index stored in each node; the lattice span begins at this value plus one.</param>
/// <param name="suffix">Text whose leading characters make up the unknown word.</param>
/// <param name="found">When false, unknown-word generation runs even if the category's Invoke flag is not 1 (presumably signals a known-dictionary hit - confirm against the caller).</param>
/// <returns><c>startIndex</c> plus the generated word length, or <paramref name="unknownWordEndIndex"/> when nothing was generated.</returns>
int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
{
    var definition = CharacterDefinitions.LookupDefinition(category);

    // Skip generation only when Invoke is off and "found" is true.
    var invokeAlways = definition[CharacterDefinitions.Invoke] == 1;
    if (!invokeAlways && found)
    {
        return unknownWordEndIndex;
    }

    // A word is at least one character long; with Group set it extends over the
    // run of following characters that have the same category at position i.
    var wordLength = 1;
    if (definition[CharacterDefinitions.Group] != 0)
    {
        for (; wordLength < suffix.Length; wordLength++)
        {
            var nextCategories = CharacterDefinitions.LookupCategories(suffix[wordLength]);
            if (nextCategories == null || i >= nextCategories.Length || nextCategories[i] != category)
            {
                break;
            }
        }
    }

    var word = suffix.Substring(0, wordLength);

    // Every character in the word is assumed to have the same category, so a single lookup suffices.
    var candidateIds = UnknownDictionary.LookupWordIds(category);
    foreach (var candidateId in candidateIds)
    {
        var unknownNode = new ViterbiNode(candidateId, word, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown);
        lattice.AddNode(unknownNode, startIndex + 1, startIndex + 1 + wordLength);
    }

    return startIndex + wordLength;
}
public void TestCostsAndFeatures()
{
    // Checks the unknown-word dictionary entries (word ids, features and costs)
    // reachable from the categories of the character '一'.

    // '一' is expected to belong to two categories: KANJI (5) and KANJINUMERIC (6).
    var kanjiAndNumeric = CharacterDefinitions.LookupCategories('一');
    kanjiAndNumeric.Length.Is(2);
    kanjiAndNumeric.SequenceEqual(new[] { 5, 6 }).IsTrue();

    // KANJI maps to six unknown-word entries.
    var kanjiWordIds = UnknownDictionary.LookupWordIds(kanjiAndNumeric[0]);
    kanjiWordIds.SequenceEqual(new[] { 2, 3, 4, 5, 6, 7 }).IsTrue();

    // Feature arrays expected for the KANJI entries, checked in order.
    var kanjiFeatureCases = new[]
    {
        new { WordId = 2, Features = new[] { "名詞", "一般", "*", "*", "*", "*", "*" } },
        new { WordId = 3, Features = new[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" } },
        new { WordId = 4, Features = new[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" } },
        new { WordId = 5, Features = new[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" } },
        new { WordId = 6, Features = new[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" } },
        // NOTE(review): word id 6 is checked twice and id 7 is never checked - confirm whether 7 was intended.
        new { WordId = 6, Features = new[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" } },
    };
    foreach (var featureCase in kanjiFeatureCases)
    {
        UnknownDictionary.GetAllFeaturesArray(featureCase.WordId).SequenceEqual(featureCase.Features).IsTrue();
    }

    // KANJINUMERIC maps to a single entry.
    const int kanjiNumericWordId = 29;
    var numericWordIds = UnknownDictionary.LookupWordIds(kanjiAndNumeric[1]);
    numericWordIds.SequenceEqual(new[] { 29 }).IsTrue();

    // Connection ids and word cost of the KANJINUMERIC entry.
    UnknownDictionary.GetLeftId(kanjiNumericWordId).Is(1295);
    UnknownDictionary.GetRightId(kanjiNumericWordId).Is(1295);
    UnknownDictionary.GetWordCost(kanjiNumericWordId).Is(27473);

    // Features of the KANJINUMERIC entry.
    var numericFeatures = UnknownDictionary.GetAllFeaturesArray(kanjiNumericWordId);
    numericFeatures.SequenceEqual(new[] { "名詞", "数", "*", "*", "*", "*", "*" }).IsTrue();
}
public void TestCostsAndFeatures()
{
    // Checks the unknown-word dictionary entries (word ids, features and costs)
    // reachable from the categories of the character '一'.

    // '一' is expected to belong to two categories: KANJI (5) and KANJINUMERIC (6).
    int[] kanjiAndNumeric = characterDefinitions.LookupCategories('一');
    Assert.AreEqual(2, kanjiAndNumeric.Length);
    int[] expectedCategories = { 5, 6 };
    Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(expectedCategories, kanjiAndNumeric));

    // KANJI maps to six unknown-word entries.
    int[] expectedKanjiWordIds = { 2, 3, 4, 5, 6, 7 };
    Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(expectedKanjiWordIds, unknownDictionary.LookupWordIds(kanjiAndNumeric[0])));

    // Feature arrays expected for the KANJI entries; checkedWordIds[k] pairs with expectedFeatures[k].
    // NOTE(review): word id 6 is checked twice and id 7 is never checked - confirm whether 7 was intended.
    int[] checkedWordIds = { 2, 3, 4, 5, 6, 6 };
    string[][] expectedFeatures =
    {
        new string[] { "名詞", "一般", "*", "*", "*", "*", "*" },
        new string[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" },
    };
    for (int k = 0; k < checkedWordIds.Length; k++)
    {
        Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(expectedFeatures[k], unknownDictionary.GetAllFeaturesArray(checkedWordIds[k])));
    }

    // KANJINUMERIC maps to a single entry.
    const int kanjiNumericWordId = 29;
    int[] expectedNumericWordIds = { 29 };
    Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(expectedNumericWordIds, unknownDictionary.LookupWordIds(kanjiAndNumeric[1])));

    // Connection ids and word cost of the KANJINUMERIC entry.
    Assert.AreEqual(1295, unknownDictionary.GetLeftId(kanjiNumericWordId));
    Assert.AreEqual(1295, unknownDictionary.GetRightId(kanjiNumericWordId));
    Assert.AreEqual(27473, unknownDictionary.GetWordCost(kanjiNumericWordId));

    // Features of the KANJINUMERIC entry.
    string[] expectedNumericFeatures = { "名詞", "数", "*", "*", "*", "*", "*" };
    Assert.IsTrue(NLPJDictTest.TestUtils.IsArrayEqual(expectedNumericFeatures, unknownDictionary.GetAllFeaturesArray(kanjiNumericWordId)));
}