Example #1
        protected internal override ParserModel trainAndUpdate(ParserModel originalModel, ObjectStream<Parse> parseSamples, ModelUpdaterParams parameters)
        {
            Dictionary mdict = ParserTrainerTool.buildDictionary(parseSamples, originalModel.HeadRules, parameters.Cutoff.Value);

            parseSamples.reset();

            // TODO: training individual models should be in the chunking parser, not here
            // Training build
            Console.WriteLine("Training builder");
            opennlp.model.EventStream bes        = new ParserEventStream(parseSamples, originalModel.HeadRules, ParserEventTypeEnum.BUILD, mdict);
            AbstractModel             buildModel = Parser.train(bes, parameters.Iterations.Value, parameters.Cutoff.Value);

            parseSamples.close();

            return originalModel.updateBuildModel(buildModel);
        }
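In practice this method is driven by OpenNLP's ModelUpdater command-line tool. A minimal calling sketch follows; it assumes this port mirrors the Java OpenNLP API (ParseSampleStream, PlainTextByLineStream, ArgumentParser, and the constructor signatures are assumptions, and the file names are illustrative):

        // Hypothetical driver; the types and constructors below are assumed from Java OpenNLP,
        // not taken from this listing.
        public static void UpdateBuildModel(string[] args)
        {
            ModelUpdaterParams parameters = ArgumentParser.parse(args, typeof(ModelUpdaterParams)); // assumed API

            ParserModel originalModel = new ParserModel(new FileInputStream("en-parser-chunking.bin"));
            ObjectStream<Parse> parseSamples =
                new ParseSampleStream(new PlainTextByLineStream(new FileInputStream("train.parse"), "UTF-8"));

            ParserModel updatedModel = trainAndUpdate(originalModel, parseSamples, parameters);
            updatedModel.serialize(new FileOutputStream("en-parser-updated.bin")); // persist the rebuilt model
        }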
Example #2
        /// <summary>
        /// Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
        /// </summary>
        /// <param name="data">The data stream of parses.</param>
        /// <param name="rules">The head rules for the parses.</param>
        /// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
        /// <returns>A dictionary object.</returns>
        public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters)
        {
            var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
            var dict   = new NGramModel();

            Parse p;

            while ((p = data.Read()) != null)
            {
                p.UpdateHeads(rules);
                var pWords = p.GetTagNodes();
                var words  = new string[pWords.Length];
                //add all uni-grams
                for (var wi = 0; wi < words.Length; wi++)
                {
                    words[wi] = pWords[wi].CoveredText;
                }

                dict.Add(new StringList(words), 1, 1);
                //add tri-grams and bi-grams for initial sequence
                var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
                var cWords = new string[chunks.Length];
                for (var wi = 0; wi < cWords.Length; wi++)
                {
                    cWords[wi] = chunks[wi].Head.CoveredText;
                }
                dict.Add(new StringList(cWords), 2, 3);

                //emulate reductions to produce additional n-grams
                var ci = 0;
                while (ci < chunks.Length)
                {
                    /*
                     * if (chunks[ci].Parent == null) {
                     *  chunks[ci].Show();
                     * } */
                    if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags))
                    {
                        // perform reduce: scan left to the first chunk that shares this chunk's parent
                        var reduceStart = ci;
                        while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent))
                        {
                            reduceStart--;
                        }
                        reduceStart++;
                        chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                        ci     = reduceStart;
                        if (chunks.Length != 0)
                        {
                            // gather up to two chunks of context on each side of the reduction point
                            var window = new string[5];
                            var wi     = 0;
                            if (ci - 2 >= 0)
                            {
                                window[wi++] = chunks[ci - 2].Head.CoveredText;
                            }
                            if (ci - 1 >= 0)
                            {
                                window[wi++] = chunks[ci - 1].Head.CoveredText;
                            }
                            window[wi++] = chunks[ci].Head.CoveredText;
                            if (ci + 1 < chunks.Length)
                            {
                                window[wi++] = chunks[ci + 1].Head.CoveredText;
                            }
                            if (ci + 2 < chunks.Length)
                            {
                                window[wi++] = chunks[ci + 2].Head.CoveredText;
                            }
                            // shrink the window when fewer than five words were collected
                            if (wi < 5)
                            {
                                var subWindow = new string[wi];
                                for (var swi = 0; swi < wi; swi++)
                                {
                                    subWindow[swi] = window[swi];
                                }
                                window = subWindow;
                            }
                            if (window.Length >= 3)
                            {
                                dict.Add(new StringList(window), 2, 3);
                            }
                            else if (window.Length == 2)
                            {
                                dict.Add(new StringList(window), 2, 2);
                            }
                        }
                        ci = reduceStart - 1; //ci will be incremented at end of loop
                    }
                    ci++;
                }
            }
            dict.CutOff(cutoff, int.MaxValue);
            return dict.ToDictionary(true);
        }
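For reference, a sketch of how BuildDictionary might be called ahead of parser training; the ParseSampleStream and HeadRules constructors and the namespaced parameter setter are assumptions based on the OpenNLP/SharpNL API shape, and the file names are illustrative:

        // Hypothetical usage; stream and head-rules types are assumed, not taken from this listing.
        var parameters = new TrainingParameters();
        parameters.Set("dict", Parameters.Cutoff, "5"); // n-grams seen fewer than 5 times are dropped

        using (var file = File.OpenRead("train.parse"))
        using (var rulesReader = new StreamReader("en-head_rules"))
        {
            var samples   = new ParseSampleStream(new PlainTextByLineStream(file, Encoding.UTF8)); // assumed ctor
            var headRules = new HeadRules(rulesReader);                                            // assumed ctor

            var dictionary = BuildDictionary(samples, headRules, parameters);
            // the dictionary then typically feeds the BUILD/CHECK event streams during training
        }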