/// <summary>
/// Builds the shared test fixtures entirely in memory: compiles the <c>char.def</c> and
/// <c>unk.def</c> resources into two memory streams, rewinds them, and reads the compiled
/// data back to populate <c>CharacterDefinitions</c>, <c>Costs</c>, <c>References</c>,
/// <c>Features</c> and <c>UnknownDictionary</c>.
/// </summary>
/// <param name="context">Test context passed to the setup method; not used here.</param>
public static void Setup(TestContext context)
{
    const string definitionEncoding = "euc-jp";

    using (var compiledCharDef = new MemoryStream())
    using (var compiledUnkDic = new MemoryStream())
    using (var charDefSource = GetResource("char.def"))
    using (var unkDefSource = GetResource("unk.def"))
    {
        // Compile the character definitions first: the unknown-word compiler
        // needs the category map they produce.
        var characterCompiler = new CharacterDefinitionsCompiler(compiledCharDef);
        characterCompiler.ReadCharacterDefinition(charDefSource, Encoding.GetEncoding(definitionEncoding));
        characterCompiler.Compile();

        var categoryMap = characterCompiler.MakeCharacterCategoryMap();

        var unknownCompiler = new UnknownDictionaryCompiler(compiledUnkDic, categoryMap);
        unknownCompiler.ReadUnknownDefinition(unkDefSource, Encoding.GetEncoding(definitionEncoding));
        unknownCompiler.Compile();

        // Rewind both outputs so the compiled data can be read back from the start.
        compiledCharDef.Position = 0;
        compiledUnkDic.Position = 0;

        // Read order matters: each reader consumes the next section of the stream.
        var definitions = IntArrayIO.ReadSparseArray2D(compiledCharDef);
        var mappings = IntArrayIO.ReadSparseArray2D(compiledCharDef);
        var symbols = StringArrayIO.ReadArray(compiledCharDef);
        CharacterDefinitions = new CharacterDefinitions(definitions, mappings, symbols);

        Costs = IntArrayIO.ReadArray2D(compiledUnkDic);
        References = IntArrayIO.ReadArray2D(compiledUnkDic);
        Features = StringArrayIO.ReadArray2D(compiledUnkDic);
        UnknownDictionary = new UnknownDictionary(CharacterDefinitions, References, Costs, Features);
    }
}
/// <summary>
/// Populates <c>Penalties</c> with the four configured penalty settings and loads every
/// dictionary component from <c>AbsoluteFolderPath</c>.
/// </summary>
/// <remarks>
/// When <c>IsSplitOnNakaguro</c> is set, the middle dot character ('・') is re-categorised
/// as <c>"SYMBOL"</c> before the unknown dictionary is created from the character definitions.
/// </remarks>
/// <exception cref="Exception">
/// Thrown when any dictionary fails to load. The original failure is attached as
/// <see cref="Exception.InnerException"/> so its type and stack trace are not lost.
/// </exception>
public override void LoadDictionaries()
{
    // Order matters: consumers read the penalties by position (0..3).
    Penalties = new List<int>
    {
        kanjiPenaltyLengthTreshold,
        kanjiPenalty,
        otherPenaltyLengthThreshold,
        otherPenalty
    };

    try
    {
        Fst = FST.NewInstance(AbsoluteFolderPath);
        ConnectionCosts = ConnectionCosts.NewInstance(AbsoluteFolderPath);
        TokenInfoDictionary = TokenInfoDictionary.NewInstance(AbsoluteFolderPath);
        CharacterDefinitions = CharacterDefinitions.NewInstance(AbsoluteFolderPath);

        if (IsSplitOnNakaguro)
        {
            CharacterDefinitions.SetCategories('・', new string[] { "SYMBOL" });
        }

        UnknownDictionary = UnknownDictionary.NewInstance(AbsoluteFolderPath, CharacterDefinitions, totalFeatures);
        InsertedDictionary = new InsertedDictionary(totalFeatures);
    }
    catch (Exception ouch)
    {
        // Keep the message callers already see, but chain the cause instead of discarding it.
        throw new Exception("Could not load dictionaries: " + ouch.Message, ouch);
    }
}
/// <summary>
/// Initialises this tokenizer from a builder: asks the builder to load its dictionaries,
/// copies the dictionary references and token factory, constructs the Viterbi builder,
/// searcher and formatter, stores the split flag, and finally initialises the dictionary map.
/// </summary>
/// <typeparam name="V">Concrete builder type.</typeparam>
/// <typeparam name="K">Tokenizer type the builder is declared for.</typeparam>
/// <param name="builder">Builder supplying dictionaries, mode, penalties and options.</param>
protected void Configure<V, K>(V builder)
    where V : Builder<K>
    where K : TokenizerBase<T>
{
    // Dictionaries must be loaded before any of the builder's properties are read.
    builder.LoadDictionaries();

    tokenFactory = builder.TokenFactory;
    tokenInfoDictionary = builder.TokenInfoDictionary;
    unknownDictionary = builder.UnknownDictionary;
    userDictionary = builder.UserDictionary;
    insertedDictionary = builder.InsertedDictionary;

    var latticeBuilder = new ViterbiBuilder(
        builder.Fst,
        tokenInfoDictionary,
        unknownDictionary,
        userDictionary,
        builder.Mode);
    viterbiBuilder = latticeBuilder;

    var pathSearcher = new ViterbiSearcher(
        builder.Mode,
        builder.ConnectionCosts,
        unknownDictionary,
        builder.Penalties);
    viterbiSearcher = pathSearcher;

    viterbiFormatter = new ViterbiFormatter(builder.ConnectionCosts);
    split = builder.Split;

    InitDictionaryMap();
}
/// <summary>
/// Loads every dictionary component through <c>Resolver</c> and stores each one on the
/// corresponding property. Failures from the individual loaders are not caught here.
/// </summary>
protected internal virtual void LoadDictionaries()
{
    // Components that only need the resolver.
    DoubleArrayTrie = DoubleArrayTrie.NewInstance(Resolver);
    ConnectionCosts = ConnectionCosts.NewInstance(Resolver);
    TokenInfoDictionary = TokenInfoDictionary.NewInstance(Resolver);
    CharacterDefinitions = CharacterDefinitions.NewInstance(Resolver);

    // The unknown dictionary is built on top of the character definitions loaded above,
    // so it has to come after them.
    UnknownDictionary = UnknownDictionary.NewInstance(
        Resolver,
        CharacterDefinitions,
        TotalFeatures);

    InsertedDictionary = new InsertedDictionary(TotalFeatures);
}
/// <summary>
/// Adds unknown-word nodes for the text starting at <paramref name="suffix"/> to the lattice
/// when the character category's definition requests it or no known word was found.
/// </summary>
/// <param name="category">Character category of the first character of <paramref name="suffix"/>.</param>
/// <param name="i">Position within each following character's category array that must equal <paramref name="category"/> for the character to be grouped.</param>
/// <param name="lattice">Lattice that receives one node per unknown-dictionary word id.</param>
/// <param name="unknownWordEndIndex">Current end index; returned unchanged when no node is added.</param>
/// <param name="startIndex">Index of the first character of <paramref name="suffix"/> in the input.</param>
/// <param name="suffix">Remaining input text starting at <paramref name="startIndex"/>.</param>
/// <param name="found">Whether a dictionary word was already found at this position.</param>
/// <returns>
/// <paramref name="startIndex"/> plus the unknown word's length when nodes were added;
/// otherwise the incoming <paramref name="unknownWordEndIndex"/>.
/// </returns>
int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
{
    var definition = CharacterDefinitions.LookupDefinition(category);
    var length = 0;

    if (definition[CharacterDefinitions.Invoke] == 1 || !found)
    {
        // An unknown word always covers at least the first character.
        length = 1;

        if (definition[CharacterDefinitions.Group] != 0)
        {
            // Grouping: extend over following characters that carry the same category
            // at position i, stopping at the first one that does not.
            for (var offset = 1; offset < suffix.Length; offset++)
            {
                var following = CharacterDefinitions.LookupCategories(suffix[offset]);

                if (following == null || i >= following.Length || category != following[i])
                {
                    break;
                }

                length++;
            }
        }
    }

    if (length <= 0)
    {
        return unknownWordEndIndex;
    }

    var surface = suffix.Substring(0, length);
    // characters in input text are supposed to be the same
    var candidateIds = UnknownDictionary.LookupWordIds(category);

    foreach (var candidateId in candidateIds)
    {
        var node = new ViterbiNode(candidateId, surface, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown);
        lattice.AddNode(node, startIndex + 1, startIndex + 1 + length);
    }

    return startIndex + length;
}
/// <summary>
/// Creates a lattice builder over the given dictionaries.
/// </summary>
/// <param name="doubleArrayTrie">Trie stored in <c>DoubleArrayTrie</c>.</param>
/// <param name="dictionary">Token info dictionary.</param>
/// <param name="unknownDictionary">Unknown-word dictionary; its character definition is also stored.</param>
/// <param name="userDictionary">Optional user dictionary; <c>UseUserDictionary</c> is true only when it is non-null.</param>
/// <param name="mode">Tokenizer mode; <c>Search</c> and <c>Extended</c> enable <c>SearchMode</c>.</param>
public ViterbiBuilder(DoubleArrayTrie doubleArrayTrie, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary, UserDictionary userDictionary, TokenizerMode mode)
{
    var hasUserDictionary = userDictionary != null;
    var isSearchLikeMode = mode == TokenizerMode.Search || mode == TokenizerMode.Extended;

    DoubleArrayTrie = doubleArrayTrie;
    Dictionary = dictionary;
    UnknownDictionary = unknownDictionary;
    UserDictionary = userDictionary;
    UseUserDictionary = hasUserDictionary;
    SearchMode = isSearchLikeMode;

    // Read through the property assigned above, exactly as before.
    CharacterDefinitions = UnknownDictionary.CharacterDefinition;
}
/// <summary>
/// Compiles <c>char.def</c> and <c>unk.def</c> to binary files on disk, then reads both files
/// back to build the <c>characterDefinitions</c> and <c>unknownDictionary</c> fixtures
/// (together with the raw <c>costs</c>, <c>references</c> and <c>features</c> arrays).
/// </summary>
public void SetUp()
{
    SortedDictionary<string, int> categoryMap;

    // Step 1: compile the character definitions into the file named by charDef.
    using (var compiledCharDef = File.Create(charDef))
    {
        var charDefCompiler = new CharacterDefinitionsCompiler(CodePagesEncodingProvider.Instance);

        using (var charDefSource = File.OpenRead(@"./Core/Resource/char.def"))
        {
            charDefCompiler.ReadCharacterDefinition(charDefSource, "euc-jp");
            charDefCompiler.Compile(compiledCharDef);
        }

        categoryMap = charDefCompiler.MakeCharacterCategoryMap();
    }

    // Step 2: compile the unknown-word definitions using the category map from step 1.
    var unkDefFile = TestUtils.CompiledPath + Path.DirectorySeparatorChar + "kuromoji-unkdef-.bin";

    using (var compiledUnkDef = File.Create(unkDefFile))
    {
        var unkDefCompiler = new UnknownDictionaryCompiler(categoryMap);

        using (var unkDefSource = File.OpenRead(@"./Core/Resource/unk.def"))
        {
            unkDefCompiler.ReadUnknownDefinition(unkDefSource, "euc-jp");
            unkDefCompiler.Compile(compiledUnkDef);
        }
    }

    // Step 3: read the compiled character definitions back (sections are read in order).
    using (var charDefInput = File.OpenRead(charDef))
    using (var charDefReader = new BinaryReader(charDefInput))
    {
        var definitions = IntegerArrayIO.ReadSparseArray2D(charDefReader);
        var mappings = IntegerArrayIO.ReadSparseArray2D(charDefReader);
        var symbols = StringArrayIO.ReadArray(charDefReader);

        characterDefinitions = new CharacterDefinitions(definitions, mappings, symbols);
    }

    // Step 4: read the compiled unknown dictionary back and assemble the fixture.
    using (var unkDefInput = File.OpenRead(unkDefFile))
    using (var unkDefReader = new BinaryReader(unkDefInput))
    {
        costs = IntegerArrayIO.ReadArray2D(unkDefReader);
        references = IntegerArrayIO.ReadArray2D(unkDefReader);
        features = StringArrayIO.ReadArray2D(unkDefReader);

        unknownDictionary = new UnknownDictionary(characterDefinitions, references, costs, features);
    }
}
/// <summary>
/// Creates a searcher for the given mode, connection costs and unknown dictionary.
/// </summary>
/// <param name="mode">Tokenizer mode.</param>
/// <param name="costs">Connection costs, also handed to the multi-searcher.</param>
/// <param name="unknownDictionary">Unknown-word dictionary.</param>
/// <param name="penalties">
/// Either an empty list (penalty settings are left untouched) or a list whose first four
/// items are, in order: kanji length threshold, kanji penalty, other length threshold,
/// other penalty. A non-empty list with fewer than four items fails on indexing.
/// </param>
public ViterbiSearcher(TokenizerMode mode, ConnectionCosts costs, UnknownDictionary unknownDictionary, List<int> penalties)
{
    var hasPenalties = penalties.Count != 0;

    if (hasPenalties)
    {
        // Positional layout of the penalties list.
        KanjiPenaltyLengthThreshold = penalties[0];
        KanjiPenalty = penalties[1];
        OtherPenaltyLengthThreshold = penalties[2];
        OtherPenalty = penalties[3];
    }

    Mode = mode;
    Costs = costs;
    UnknownDictionary = unknownDictionary;

    // Created last: it receives this searcher instance.
    MultiSearcher = new MultiSearcher(costs, mode, this);
}
/// <summary>
/// Loads every dictionary component from <c>AbsoluteFolderPath</c> and stores each one on
/// the corresponding property.
/// </summary>
/// <exception cref="Exception">
/// Thrown when any component fails to load; the underlying failure is available as the
/// inner exception.
/// </exception>
public virtual void LoadDictionaries()
{
    try
    {
        Fst = FST.FST.NewInstance(AbsoluteFolderPath);
        ConnectionCosts = ConnectionCosts.NewInstance(AbsoluteFolderPath);
        TokenInfoDictionary = TokenInfoDictionary.NewInstance(AbsoluteFolderPath);
        CharacterDefinitions = CharacterDefinitions.NewInstance(AbsoluteFolderPath);

        // Depends on the character definitions loaded on the previous line.
        UnknownDictionary = UnknownDictionary.NewInstance(
            AbsoluteFolderPath,
            CharacterDefinitions,
            totalFeatures);

        InsertedDictionary = new InsertedDictionary(totalFeatures);
    }
    catch (Exception cause)
    {
        throw new Exception("Could not load dictionaries.", cause);
    }
}
/// <summary>
/// Creates a searcher for the given mode, connection costs and unknown dictionary.
/// </summary>
/// <param name="mode">Tokenization mode.</param>
/// <param name="costs">Connection costs, also handed to the multi-searcher.</param>
/// <param name="unknownDictionary">Unknown-word dictionary.</param>
/// <param name="penalties">
/// Empty to leave the penalty fields untouched; otherwise items 0-3 are read as kanji
/// length threshold, kanji penalty, other length threshold and other penalty. A non-empty
/// list with fewer than four items fails on indexing.
/// </param>
public ViterbiSearcher(Mode mode, ConnectionCosts costs, UnknownDictionary unknownDictionary, List<int> penalties)
{
    if (penalties.Count != 0)
    {
        // The list is positional: [kanji threshold, kanji penalty, other threshold, other penalty].
        this.kanjiPenaltyLengthThreshold = penalties[0];
        this.kanjiPenalty = penalties[1];
        this.otherPenaltyLengthThreshold = penalties[2];
        this.otherPenalty = penalties[3];
    }

    this.mode = mode;
    this.costs = costs;
    this.unknownDictionary = unknownDictionary;

    // Built last because it is given a reference to this searcher.
    this.multiSearcher = new MultiSearcher(costs, mode, this);
}
/// <summary>
/// Creates a lattice builder over the given dictionaries.
/// </summary>
/// <param name="fst">FST with surface forms.</param>
/// <param name="dictionary">Token info dictionary.</param>
/// <param name="unknownDictionary">Unknown word dictionary; its character definition is also stored.</param>
/// <param name="userDictionary">User dictionary; may be null, in which case the user dictionary is not used.</param>
/// <param name="mode">Tokenization <see cref="Mode"/>; SEARCH and EXTENDED turn search mode on.</param>
public ViterbiBuilder(FST.FST fst, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary, UserDictionary userDictionary, Mode mode)
{
    this.fst = fst;
    this.dictionary = dictionary;
    this.unknownDictionary = unknownDictionary;
    this.userDictionary = userDictionary;

    var hasUserDictionary = userDictionary != null;
    this.useUserDictionary = hasUserDictionary;

    // Only ever switches search mode on; for other modes the field keeps its existing value.
    var isSearchLikeMode = mode == Mode.SEARCH || mode == Mode.EXTENDED;
    if (isSearchLikeMode)
    {
        searchMode = true;
    }

    this.characterDefinitions = unknownDictionary.GetCharacterDefinition();
}
/// <summary>
/// Verifies the categories of '一' and, for those categories, the word ids, features and
/// costs reported by the unknown dictionary.
/// </summary>
public void TestCostsAndFeatures()
{
    var categories = CharacterDefinitions.LookupCategories('一');

    // KANJI & KANJINUMERIC
    categories.Length.Is(2);
    categories.SequenceEqual(new int[] { 5, 6 }).IsTrue();

    // KANJI entries
    UnknownDictionary.LookupWordIds(categories[0]).SequenceEqual(new int[] { 2, 3, 4, 5, 6, 7 }).IsTrue();

    // KANJI feature variety
    // NOTE(review): word id 6 is checked twice and word id 7 is never checked; the last
    // row may have been intended for id 7 -- confirm against unk.def before changing it.
    var kanjiWordIds = new int[] { 2, 3, 4, 5, 6, 6 };
    var kanjiFeatures = new string[][]
    {
        new string[] { "名詞", "一般", "*", "*", "*", "*", "*" },
        new string[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" },
        new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" }
    };

    for (var row = 0; row < kanjiWordIds.Length; row++)
    {
        UnknownDictionary.GetAllFeaturesArray(kanjiWordIds[row]).SequenceEqual(kanjiFeatures[row]).IsTrue();
    }

    // KANJINUMERIC entry
    UnknownDictionary.LookupWordIds(categories[1]).SequenceEqual(new int[] { 29 }).IsTrue();

    // KANJINUMERIC costs
    UnknownDictionary.GetLeftId(29).Is(1295);
    UnknownDictionary.GetRightId(29).Is(1295);
    UnknownDictionary.GetWordCost(29).Is(27473);

    // KANJINUMERIC features
    var numericFeatures = new string[] { "名詞", "数", "*", "*", "*", "*", "*" };
    UnknownDictionary.GetAllFeaturesArray(29).SequenceEqual(numericFeatures).IsTrue();
}