/// <summary>
/// Compiles the bundled "char.def" and "unk.def" resources into in-memory streams and
/// reads them back to populate <c>CharacterDefinitions</c>, <c>Costs</c>,
/// <c>References</c>, <c>Features</c> and <c>UnknownDictionary</c>.
/// </summary>
/// <param name="context">Test context; not used by this method.</param>
public static void Setup(TestContext context)
        {
            using (var compiledCharDefs = new MemoryStream())
            using (var compiledUnkDic = new MemoryStream())
            using (var charDefSource = GetResource("char.def"))
            using (var unkDefSource = GetResource("unk.def"))
            {
                // Compile the character definitions first: the unknown-word compiler
                // needs the category map they produce.
                var charCompiler = new CharacterDefinitionsCompiler(compiledCharDefs);
                var eucJp = Encoding.GetEncoding("euc-jp");
                charCompiler.ReadCharacterDefinition(charDefSource, eucJp);
                charCompiler.Compile();

                var unkCompiler = new UnknownDictionaryCompiler(compiledUnkDic, charCompiler.MakeCharacterCategoryMap());
                unkCompiler.ReadUnknownDefinition(unkDefSource, eucJp);
                unkCompiler.Compile();

                // Rewind and read the compiled sections in the order they are consumed:
                // definitions, mappings, then symbols.
                compiledCharDefs.Position = 0;
                CharacterDefinitions = new CharacterDefinitions(
                    IntArrayIO.ReadSparseArray2D(compiledCharDefs),
                    IntArrayIO.ReadSparseArray2D(compiledCharDefs),
                    StringArrayIO.ReadArray(compiledCharDefs));

                // Same for the unknown dictionary: costs, references, then features.
                compiledUnkDic.Position = 0;
                Costs = IntArrayIO.ReadArray2D(compiledUnkDic);
                References = IntArrayIO.ReadArray2D(compiledUnkDic);
                Features = StringArrayIO.ReadArray2D(compiledUnkDic);

                UnknownDictionary = new UnknownDictionary(CharacterDefinitions, References, Costs, Features);
            }
        }
            /// <summary>
            /// Populates the penalty list and loads every dictionary from
            /// <c>AbsoluteFolderPath</c>.
            /// </summary>
            /// <exception cref="Exception">
            /// Thrown when any dictionary fails to load. The original failure is attached
            /// as <see cref="Exception.InnerException"/>.
            /// </exception>
            public override void LoadDictionaries()
            {
                // Order matters: consumers read these by index (0..3).
                Penalties = new List <int>
                {
                    kanjiPenaltyLengthTreshold,
                    kanjiPenalty,
                    otherPenaltyLengthThreshold,
                    otherPenalty
                };

                try
                {
                    Fst                  = FST.NewInstance(AbsoluteFolderPath);
                    ConnectionCosts      = ConnectionCosts.NewInstance(AbsoluteFolderPath);
                    TokenInfoDictionary  = TokenInfoDictionary.NewInstance(AbsoluteFolderPath);
                    CharacterDefinitions = CharacterDefinitions.NewInstance(AbsoluteFolderPath);

                    // Must happen before the unknown dictionary is built from these definitions.
                    if (IsSplitOnNakaguro)
                    {
                        CharacterDefinitions.SetCategories('・', new string[] { "SYMBOL" });
                    }

                    UnknownDictionary  = UnknownDictionary.NewInstance(AbsoluteFolderPath, CharacterDefinitions, totalFeatures);
                    InsertedDictionary = new InsertedDictionary(totalFeatures);
                }
                catch (Exception ouch)
                {
                    // Keep the original exception as the inner exception so its type and
                    // stack trace are not lost; the message text is unchanged.
                    throw new Exception("Could not load dictionaries: " + ouch.Message, ouch);
                }
            }
Example #3
0
        /// <summary>
        /// Initialises this tokenizer from <paramref name="builder"/>: loads the builder's
        /// dictionaries, copies them into this instance, creates the Viterbi builder,
        /// searcher and formatter, and finally calls <c>InitDictionaryMap</c>.
        /// </summary>
        /// <typeparam name="V">Concrete builder type.</typeparam>
        /// <typeparam name="K">Tokenizer type produced by the builder.</typeparam>
        /// <param name="builder">Builder supplying dictionaries and settings.</param>
        protected void Configure <V, K>(V builder) where V : Builder <K> where K : TokenizerBase <T>
        {
            // The builder's dictionary properties are only populated after this call.
            builder.LoadDictionaries();

            tokenFactory = builder.TokenFactory;

            tokenInfoDictionary = builder.TokenInfoDictionary;
            unknownDictionary = builder.UnknownDictionary;
            userDictionary = builder.UserDictionary;
            insertedDictionary = builder.InsertedDictionary;

            // Lattice construction uses the dictionaries captured above.
            viterbiBuilder = new ViterbiBuilder(
                builder.Fst, tokenInfoDictionary, unknownDictionary, userDictionary, builder.Mode);

            viterbiSearcher = new ViterbiSearcher(
                builder.Mode, builder.ConnectionCosts, unknownDictionary, builder.Penalties);

            viterbiFormatter = new ViterbiFormatter(builder.ConnectionCosts);
            split = builder.Split;

            InitDictionaryMap();
        }
Example #4
0
 /// <summary>
 /// Loads every dictionary through <c>Resolver</c> and stores the results on this instance.
 /// </summary>
 protected internal virtual void LoadDictionaries()
 {
     DoubleArrayTrie = DoubleArrayTrie.NewInstance(Resolver);
     ConnectionCosts = ConnectionCosts.NewInstance(Resolver);
     TokenInfoDictionary = TokenInfoDictionary.NewInstance(Resolver);

     // The unknown dictionary is built on top of the character definitions,
     // so those have to be loaded first.
     var charDefs = CharacterDefinitions.NewInstance(Resolver);
     CharacterDefinitions = charDefs;
     UnknownDictionary = UnknownDictionary.NewInstance(Resolver, charDefs, TotalFeatures);

     InsertedDictionary = new InsertedDictionary(TotalFeatures);
 }
Example #5
0
        /// <summary>
        /// Adds unknown-word nodes for <paramref name="category"/> to the lattice when the
        /// category's definition requests it or when no dictionary word was found.
        /// </summary>
        /// <param name="category">Character category id of the first character of <paramref name="suffix"/>.</param>
        /// <param name="i">Index at which <paramref name="category"/> must appear in a following character's category array for it to be grouped.</param>
        /// <param name="lattice">Lattice that receives the generated nodes.</param>
        /// <param name="unknownWordEndIndex">Current end index; returned unchanged when nothing is generated.</param>
        /// <param name="startIndex">Start position of <paramref name="suffix"/> in the input text.</param>
        /// <param name="suffix">Remaining text starting at <paramref name="startIndex"/>.</param>
        /// <param name="found">Whether a known word was already found at this position.</param>
        /// <returns>
        /// <c>startIndex</c> plus the unknown word length when nodes were added; otherwise
        /// <paramref name="unknownWordEndIndex"/>.
        /// </returns>
        int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
        {
            var definition = CharacterDefinitions.LookupDefinition(category);

            // Nothing to do when the category is not flagged for invocation and a word was found.
            if (definition[CharacterDefinitions.Invoke] != 1 && found)
            {
                return(unknownWordEndIndex);
            }

            // An unknown word always covers at least the first character.
            var length = 1;

            if (definition[CharacterDefinitions.Group] != 0)
            {
                // Grouping: extend over following characters that carry the same category
                // at position i of their category array.
                while (length < suffix.Length)
                {
                    var nextCategories = CharacterDefinitions.LookupCategories(suffix[length]);

                    if (nextCategories == null)
                    {
                        break;
                    }

                    if (i >= nextCategories.Length || category != nextCategories[i])
                    {
                        break;
                    }

                    length++;
                }
            }

            var surface = suffix.Substring(0, length);

            // characters in input text are supposed to be the same
            foreach (var wordId in UnknownDictionary.LookupWordIds(category))
            {
                lattice.AddNode(
                    new ViterbiNode(wordId, surface, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown),
                    startIndex + 1,
                    startIndex + 1 + length);
            }

            return(startIndex + length);
        }
Example #6
0
        /// <summary>
        /// Creates a Viterbi lattice builder over the given dictionaries.
        /// </summary>
        /// <param name="doubleArrayTrie">Trie stored in <c>DoubleArrayTrie</c>.</param>
        /// <param name="dictionary">Token info dictionary.</param>
        /// <param name="unknownDictionary">Unknown word dictionary; also supplies the character definitions.</param>
        /// <param name="userDictionary">User dictionary, or <c>null</c> to disable user-dictionary lookups.</param>
        /// <param name="mode">Tokenization mode; Search and Extended enable search mode.</param>
        public ViterbiBuilder(DoubleArrayTrie doubleArrayTrie, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary, UserDictionary userDictionary, TokenizerMode mode)
        {
            DoubleArrayTrie = doubleArrayTrie;
            Dictionary = dictionary;

            UserDictionary = userDictionary;
            UseUserDictionary = userDictionary != null;

            UnknownDictionary = unknownDictionary;
            CharacterDefinitions = UnknownDictionary.CharacterDefinition;

            var isSearch = mode == TokenizerMode.Search;
            var isExtended = mode == TokenizerMode.Extended;
            SearchMode = isSearch || isExtended;
        }
        /// <summary>
        /// Compiles "char.def" and "unk.def" to binary files on disk, then reads those files
        /// back to populate <c>characterDefinitions</c>, <c>costs</c>, <c>references</c>,
        /// <c>features</c> and <c>unknownDictionary</c>.
        /// </summary>
        public void SetUp()
        {
            SortedDictionary <string, int> categoryMap;

            // Step 1: compile the character definitions into the charDef file.
            using (var compiledCharDef = File.Create(charDef))
            {
                var charCompiler = new CharacterDefinitionsCompiler(CodePagesEncodingProvider.Instance);

                using (var source = File.OpenRead(@"./Core/Resource/char.def"))
                {
                    charCompiler.ReadCharacterDefinition(source, "euc-jp");
                    charCompiler.Compile(compiledCharDef);
                }

                categoryMap = charCompiler.MakeCharacterCategoryMap();
            }

            // Step 2: compile the unknown-word definitions using the category map from step 1.
            var unkDefFile = $"{TestUtils.CompiledPath}{Path.DirectorySeparatorChar}kuromoji-unkdef-.bin";

            using (var compiledUnkDef = File.Create(unkDefFile))
            {
                var unkCompiler = new UnknownDictionaryCompiler(categoryMap);

                using (var source = File.OpenRead(@"./Core/Resource/unk.def"))
                {
                    unkCompiler.ReadUnknownDefinition(source, "euc-jp");
                    unkCompiler.Compile(compiledUnkDef);
                }
            }

            // Step 3: read the compiled character definitions (definitions, mappings, symbols).
            using (var charDefStream = File.OpenRead(charDef))
            using (var charDefReader = new BinaryReader(charDefStream))
            {
                var definitions = IntegerArrayIO.ReadSparseArray2D(charDefReader);
                var mappings = IntegerArrayIO.ReadSparseArray2D(charDefReader);
                var symbols = StringArrayIO.ReadArray(charDefReader);

                characterDefinitions = new CharacterDefinitions(definitions, mappings, symbols);
            }

            // Step 4: read the compiled unknown dictionary (costs, references, features).
            using (var unkDefStream = File.OpenRead(unkDefFile))
            using (var unkDefReader = new BinaryReader(unkDefStream))
            {
                costs = IntegerArrayIO.ReadArray2D(unkDefReader);
                references = IntegerArrayIO.ReadArray2D(unkDefReader);
                features = StringArrayIO.ReadArray2D(unkDefReader);

                unknownDictionary = new UnknownDictionary(characterDefinitions, references, costs, features);
            }
        }
Example #8
0
        /// <summary>
        /// Creates a searcher for the given mode and dictionaries.
        /// </summary>
        /// <param name="mode">Tokenization mode.</param>
        /// <param name="costs">Connection costs.</param>
        /// <param name="unknownDictionary">Unknown word dictionary.</param>
        /// <param name="penalties">
        /// Either empty (penalty settings are left untouched) or a list read by index as:
        /// kanji length threshold, kanji penalty, other length threshold, other penalty.
        /// </param>
        public ViterbiSearcher(TokenizerMode mode, ConnectionCosts costs, UnknownDictionary unknownDictionary, List <int> penalties)
        {
            Mode = mode;
            Costs = costs;
            UnknownDictionary = unknownDictionary;

            if (penalties.Count > 0)
            {
                KanjiPenaltyLengthThreshold = penalties[0];
                KanjiPenalty = penalties[1];
                OtherPenaltyLengthThreshold = penalties[2];
                OtherPenalty = penalties[3];
            }

            // Created last because it receives a reference to this instance.
            MultiSearcher = new MultiSearcher(costs, mode, this);
        }
Example #9
0
 /// <summary>
 /// Loads every dictionary from <c>AbsoluteFolderPath</c> and stores the results on this instance.
 /// </summary>
 /// <exception cref="Exception">
 /// Thrown when any dictionary fails to load; the underlying failure is the inner exception.
 /// </exception>
 public virtual void LoadDictionaries()
 {
     try
     {
         Fst = FST.FST.NewInstance(AbsoluteFolderPath);
         ConnectionCosts = ConnectionCosts.NewInstance(AbsoluteFolderPath);
         TokenInfoDictionary = TokenInfoDictionary.NewInstance(AbsoluteFolderPath);

         // The unknown dictionary is built on top of the character definitions.
         var charDefs = CharacterDefinitions.NewInstance(AbsoluteFolderPath);
         CharacterDefinitions = charDefs;
         UnknownDictionary = UnknownDictionary.NewInstance(AbsoluteFolderPath, charDefs, totalFeatures);

         InsertedDictionary = new InsertedDictionary(totalFeatures);
     }
     catch (Exception cause)
     {
         throw new Exception("Could not load dictionaries.", cause);
     }
 }
Example #10
0
        /// <summary>
        /// Creates a searcher for the given mode and dictionaries.
        /// </summary>
        /// <param name="mode">Tokenization mode.</param>
        /// <param name="costs">Connection costs.</param>
        /// <param name="unknownDictionary">Unknown word dictionary.</param>
        /// <param name="penalties">
        /// Either empty (penalty fields are left untouched) or a list read by index as:
        /// kanji length threshold, kanji penalty, other length threshold, other penalty.
        /// </param>
        public ViterbiSearcher(Mode mode,
                               ConnectionCosts costs,
                               UnknownDictionary unknownDictionary,
                               List <int> penalties)
        {
            this.mode = mode;
            this.costs = costs;
            this.unknownDictionary = unknownDictionary;

            if (penalties.Count != 0)
            {
                kanjiPenaltyLengthThreshold = penalties[0];
                kanjiPenalty = penalties[1];
                otherPenaltyLengthThreshold = penalties[2];
                otherPenalty = penalties[3];
            }

            // Created last because it receives a reference to this instance.
            multiSearcher = new MultiSearcher(costs, mode, this);
        }
        /// <summary>
        /// Creates a Viterbi lattice builder over the given dictionaries.
        /// </summary>
        /// <param name="fst">FST with surface forms.</param>
        /// <param name="dictionary">Token info dictionary.</param>
        /// <param name="unknownDictionary">Unknown word dictionary; also supplies the character definitions.</param>
        /// <param name="userDictionary">User dictionary, or <c>null</c> to disable user-dictionary lookups.</param>
        /// <param name="mode">Tokenization mode; SEARCH and EXTENDED switch search mode on.</param>
        public ViterbiBuilder(FST.FST fst,
                              TokenInfoDictionary dictionary,
                              UnknownDictionary unknownDictionary,
                              UserDictionary userDictionary,
                              Mode mode)
        {
            this.fst = fst;
            this.dictionary = dictionary;

            this.userDictionary = userDictionary;
            useUserDictionary = userDictionary != null;

            this.unknownDictionary = unknownDictionary;

            // Only ever switched on here; for other modes the field keeps its existing value.
            var wantsSearchMode = mode == Mode.SEARCH || mode == Mode.EXTENDED;
            if (wantsSearchMode)
            {
                searchMode = true;
            }

            characterDefinitions = unknownDictionary.GetCharacterDefinition();
        }
        /// <summary>
        /// Verifies the categories of '一' and the word ids, costs and features that the
        /// compiled unknown dictionary returns for the KANJI and KANJINUMERIC categories.
        /// </summary>
        public void TestCostsAndFeatures()
        {
            var categories = CharacterDefinitions.LookupCategories('一');

            // KANJI & KANJINUMERIC
            categories.Length.Is(2);

            categories.SequenceEqual(new int[] { 5, 6 }).IsTrue();

            // KANJI entries
            UnknownDictionary.LookupWordIds(categories[0]).SequenceEqual(new int[] { 2, 3, 4, 5, 6, 7 }).IsTrue();

            // KANJI feature variety
            UnknownDictionary.GetAllFeaturesArray(2).SequenceEqual(new string[] { "名詞", "一般", "*", "*", "*", "*", "*" }).IsTrue();

            UnknownDictionary.GetAllFeaturesArray(3).SequenceEqual(new string[] { "名詞", "サ変接続", "*", "*", "*", "*", "*" }).IsTrue();

            UnknownDictionary.GetAllFeaturesArray(4).SequenceEqual(new string[] { "名詞", "固有名詞", "地域", "一般", "*", "*", "*" }).IsTrue();

            UnknownDictionary.GetAllFeaturesArray(5).SequenceEqual(new string[] { "名詞", "固有名詞", "組織", "*", "*", "*", "*" }).IsTrue();

            // Word id 6 was previously asserted twice with identical expectations; the
            // redundant copy is removed. NOTE(review): word id 7 appears in the KANJI id
            // list above but its features are not asserted here.
            UnknownDictionary.GetAllFeaturesArray(6).SequenceEqual(new string[] { "名詞", "固有名詞", "人名", "一般", "*", "*", "*" }).IsTrue();

            // KANJINUMERIC entry
            UnknownDictionary.LookupWordIds(categories[1]).SequenceEqual(new int[] { 29 }).IsTrue();

            // KANJINUMERIC costs
            UnknownDictionary.GetLeftId(29).Is(1295);
            UnknownDictionary.GetRightId(29).Is(1295);
            UnknownDictionary.GetWordCost(29).Is(27473);

            // KANJINUMERIC features
            UnknownDictionary.GetAllFeaturesArray(29).SequenceEqual(new string[] { "名詞", "数", "*", "*", "*", "*", "*" }).IsTrue();
        }