Example #1
0
        /// <summary>
        /// Replaces both perceptron models with fresh instances and loads each of
        /// them from <paramref name="stream"/>: the MSD model first, then the
        /// lemma model.
        /// </summary>
        /// <param name="stream">Stream that both models are loaded from, in that order.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/> is <c>null</c>.
        /// </exception>
        public void Load(Stream stream)
        {
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            // Each field is assigned before its Load call runs, so a failure while
            // loading leaves the field pointing at the new (partially loaded) model.
            (perceptronMsd = new Perceptron()).Load(stream);
            (perceptronLemma = new PerceptronString()).Load(stream);
        }
Example #2
0
        /// <summary>
        /// Trains the MSD perceptron and the lemma perceptron from scratch on the
        /// given sentences.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Both models are replaced with new, empty instances.
        /// 2. Every sentence is normalized token by token (and reversed first when
        ///    <c>Reverse</c> is set) and materialized into an array.
        /// 3. The set of all MSD values seen in correct and possible tags is
        ///    collected; it is the candidate set for tokens with no possible tags.
        /// 4. For <c>Iterations</c> passes, sentences are visited in an order keyed
        ///    by <c>random</c>; for each token the best-scoring MSD candidate is
        ///    found and the MSD model is updated, then (when the correct tag has a
        ///    lemma) the same is done for the lemma model.
        /// 5. Weights are averaged when <c>Average</c> is set, and weights below
        ///    <c>WeightThreshold</c> are removed.
        /// <c>IterationStarted</c> / <c>IterationFinished</c> are raised with the
        /// 1-based iteration number.
        /// </remarks>
        /// <param name="sentences">Training sentences; each token is expected to carry a correct tag.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sentences"/> is <c>null</c>.
        /// </exception>
        public void Train(IEnumerable<Sentence> sentences)
        {
            // Fail fast, before the existing models are thrown away.
            if (sentences == null)
            {
                throw new ArgumentNullException("sentences");
            }

            perceptronMsd   = new Perceptron();
            perceptronLemma = new PerceptronString();

            // The cast selects LINQ's Reverse() (a new sequence) for the reversed case.
            IndexedSentence[] normalizedSentences =
                sentences.Select(s => new IndexedSentence((Reverse ? (s as IEnumerable<Token>).Reverse() : s).Select(t => Normalize(t)))).ToArray();

            // Every MSD seen anywhere in the data, stored without a lemma.
            HashSet<Tag> alltags = new HashSet<Tag>();

            foreach (IndexedSentence sentence in normalizedSentences)
            {
                foreach (Token token in sentence)
                {
                    alltags.Add(new Tag(token.CorrectTag.Msd, null));

                    // PossibleTags may be null (the training loop below allows for it),
                    // so it must not be enumerated unconditionally.
                    if (token.PossibleTags != null)
                    {
                        foreach (Tag tag in token.PossibleTags)
                        {
                            alltags.Add(new Tag(tag.Msd, null));
                        }
                    }
                }
            }

            for (int iteration = 0; iteration < Iterations; iteration++)
            {
                if (IterationStarted != null)
                {
                    IterationStarted(iteration + 1);
                }

                // Sorting by a random key visits the sentences in a shuffled order each pass.
                foreach (IndexedSentence sentence in normalizedSentences.OrderBy(s => random.NextUInt()))
                {
                    foreach (Token token in sentence)
                    {
                        if (token.PossibleTags != null && token.PossibleTags.Length == 1)
                        {
                            token.PredictedTag = token.CorrectTag; // todo: possibleTags[0]?? what if it's not correct tag?
                            continue;
                        }

                        // Candidates: the token's own tags ordered by MSD, or every known
                        // MSD when the token has none (null or empty).
                        IEnumerable<Tag> possibleTags;
                        if (token.PossibleTags != null && token.PossibleTags.Length > 0)
                        {
                            possibleTags = token.PossibleTags.OrderBy(t => t.Msd).ToArray();
                        }
                        else
                        {
                            possibleTags = alltags;
                        }

                        double?  bestMsdScore    = null;
                        Tag      bestMsd         = null;
                        Features bestMsdFeatures = null;

                        // Features requested with the final flag set to true are computed
                        // once per token and reused for every candidate.
                        Features localFeaturesTag = GetFeatures(featureTemplatesTag, token, sentence, true);

                        foreach (Tag tag in possibleTags)
                        {
                            Tag tagMsd = new Tag(tag.Msd);
                            // GetFeatures receives the token, so the candidate is set as
                            // its predicted tag before the features are extracted.
                            token.PredictedTag = tagMsd;

                            Features featuresTag = GetFeatures(featureTemplatesTag, token, sentence, false);
                            featuresTag.AddRange(localFeaturesTag);

                            double score = perceptronMsd.Score(featuresTag, tagMsd);
                            if (bestMsdScore == null || score > bestMsdScore.Value)
                            {
                                bestMsdScore    = score;
                                bestMsd         = tagMsd;
                                bestMsdFeatures = featuresTag;
                            }
                        }

                        perceptronMsd.Update(bestMsdFeatures, new Tag(token.CorrectTag.Msd), bestMsd);

                        if (token.CorrectTag.Lemma != null)
                        {
                            double?  bestLemmaScore    = null;
                            string   bestLemma         = null;
                            Features bestLemmaFeatures = null;

                            // Lemma features are built with the correct MSD on the token.
                            token.PredictedTag = new Tag(token.CorrectTag.Msd);
                            Features localFeaturesLemma = GetFeatures(featureTemplatesLemma, token, sentence, true);

                            foreach (Tag tag in possibleTags)
                            {
                                // Only candidates that carry a lemma and share the MSD
                                // chosen above are considered. Entries taken from alltags
                                // were created with a null lemma, so none of them qualify.
                                if (tag.Msd != bestMsd.Msd || tag.Lemma == null)
                                {
                                    continue;
                                }
                                token.PredictedTag = new Tag(token.CorrectTag.Msd, tag.Lemma);
                                Features featuresLemma = GetFeatures(featureTemplatesLemma, token, sentence, false);
                                featuresLemma.AddRange(localFeaturesLemma);
                                double scoreLemma = perceptronLemma.Score(featuresLemma, tag.Lemma);
                                if (bestLemmaScore == null || scoreLemma > bestLemmaScore.Value)
                                {
                                    bestLemmaScore    = scoreLemma;
                                    bestLemma         = tag.Lemma;
                                    bestLemmaFeatures = featuresLemma;
                                }
                            }

                            // No qualifying lemma candidate means nothing to update.
                            if (bestLemmaFeatures != null)
                            {
                                perceptronLemma.Update(bestLemmaFeatures, token.CorrectTag.Lemma, bestLemma);
                            }
                        }

                        // Leave the correct tag on the token once it has been processed.
                        token.PredictedTag = token.CorrectTag;
                    }
                }

                if (IterationFinished != null)
                {
                    IterationFinished(iteration + 1);
                }
            }

            if (Average)
            {
                perceptronMsd.AverageWeights();
                perceptronLemma.AverageWeights();
            }

            perceptronMsd.RemoveInsignificantWeights(WeightThreshold);
            perceptronLemma.RemoveInsignificantWeights(WeightThreshold);
        }