/// <summary>
/// Replaces both models with freshly constructed instances and loads them from
/// <paramref name="stream"/>: the MSD perceptron first, then the lemma perceptron.
/// </summary>
/// <param name="stream">Stream handed to each model's <c>Load</c> method, in that order.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
public void Load(Stream stream)
{
    // Reject a missing stream before any model field is touched.
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    // The read order is significant: both models consume the same stream sequentially.
    perceptronMsd = new Perceptron();
    perceptronMsd.Load(stream);

    perceptronLemma = new PerceptronString();
    perceptronLemma.Load(stream);
}
/// <summary>
/// Trains the MSD perceptron and the lemma perceptron from scratch on the given sentences.
/// </summary>
/// <remarks>
/// Both models are replaced by new instances. The sentences are normalized (and reversed token-wise
/// when <c>Reverse</c> is set), then processed for <c>Iterations</c> passes; on every pass the
/// sentences are visited in an order keyed by values drawn from <c>random</c>.
/// <c>IterationStarted</c> and <c>IterationFinished</c> are raised with the 1-based pass number.
/// After the last pass the weights are averaged when <c>Average</c> is set, and
/// <c>RemoveInsignificantWeights(WeightThreshold)</c> is called on both models.
/// </remarks>
/// <param name="sentences">Training sentences; each token must carry a non-null <c>CorrectTag</c>.
/// A token's <c>PossibleTags</c> may be <c>null</c> or empty, in which case every MSD seen in the
/// training data is used as a candidate.</param>
/// <exception cref="ArgumentNullException"><paramref name="sentences"/> is <c>null</c>.</exception>
public void Train(IEnumerable<Sentence> sentences)
{
    if (sentences == null)
    {
        throw new ArgumentNullException("sentences");
    }

    perceptronMsd = new Perceptron();
    perceptronLemma = new PerceptronString();

    IndexedSentence[] normalizedSentences = sentences
        .Select(s => new IndexedSentence((Reverse ? (s as IEnumerable<Token>).Reverse() : s).Select(t => Normalize(t))))
        .ToArray();

    // Collect every MSD occurring in the data (lemma stripped). This is the fallback
    // candidate set for tokens that come without their own candidate list.
    HashSet<Tag> alltags = new HashSet<Tag>();
    foreach (IndexedSentence sentence in normalizedSentences)
    {
        foreach (Token token in sentence)
        {
            alltags.Add(new Tag(token.CorrectTag.Msd, null));

            // A token may have no candidate list at all; only its correct tag contributes then.
            if (token.PossibleTags == null)
            {
                continue;
            }

            foreach (Tag tag in token.PossibleTags)
            {
                alltags.Add(new Tag(tag.Msd, null));
            }
        }
    }

    for (int iteration = 0; iteration < Iterations; iteration++)
    {
        if (IterationStarted != null)
        {
            IterationStarted(iteration + 1);
        }

        // Re-order the sentences on every pass using a fresh random key per sentence.
        foreach (IndexedSentence sentence in normalizedSentences.OrderBy(s => random.NextUInt()))
        {
            foreach (Token token in sentence)
            {
                // Unambiguous token: nothing to learn, just record the tag for later context features.
                if (token.PossibleTags != null && token.PossibleTags.Length == 1)
                {
                    // todo: possibleTags[0]?? what if it's not correct tag?
                    token.PredictedTag = token.CorrectTag;
                    continue;
                }

                // Sort candidates by MSD so the evaluation order is stable. ToArray() never
                // returns null, so the null case has to be handled before dereferencing.
                IEnumerable<Tag> possibleTags = token.PossibleTags != null
                    ? token.PossibleTags.OrderBy(t => t.Msd).ToArray()
                    : null;
                if (possibleTags == null || !possibleTags.Any())
                {
                    possibleTags = alltags;
                }

                // --- MSD model: score every candidate and keep the best one ---
                double? bestMsdScore = null;
                Tag bestMsd = null;
                Features bestMsdFeatures = null;

                // Extracted once per token and appended to every candidate's features below
                // (the final flag presumably selects the candidate-independent templates - confirm in GetFeatures).
                Features localFeaturesTag = GetFeatures(featureTemplatesTag, token, sentence, true);

                foreach (Tag tag in possibleTags)
                {
                    Tag tagMsd = new Tag(tag.Msd);

                    // GetFeatures is handed the token itself, so the candidate is installed on it first.
                    token.PredictedTag = tagMsd;

                    Features featuresTag = GetFeatures(featureTemplatesTag, token, sentence, false);
                    featuresTag.AddRange(localFeaturesTag);

                    double score = perceptronMsd.Score(featuresTag, tagMsd);
                    if (bestMsdScore == null || score > bestMsdScore.Value)
                    {
                        bestMsdScore = score;
                        bestMsd = tagMsd;
                        bestMsdFeatures = featuresTag;
                    }
                }

                perceptronMsd.Update(bestMsdFeatures, new Tag(token.CorrectTag.Msd), bestMsd);

                // --- Lemma model: only trained when a gold lemma is available ---
                if (token.CorrectTag.Lemma != null)
                {
                    double? bestLemmaScore = null;
                    string bestLemma = null;
                    Features bestLemmaFeatures = null;

                    token.PredictedTag = new Tag(token.CorrectTag.Msd);
                    Features localFeaturesLemma = GetFeatures(featureTemplatesLemma, token, sentence, true);

                    foreach (Tag tag in possibleTags)
                    {
                        // Only lemma candidates belonging to the MSD the tagger just predicted compete.
                        if (tag.Msd != bestMsd.Msd || tag.Lemma == null)
                        {
                            continue;
                        }

                        token.PredictedTag = new Tag(token.CorrectTag.Msd, tag.Lemma);

                        Features featuresLemma = GetFeatures(featureTemplatesLemma, token, sentence, false);
                        featuresLemma.AddRange(localFeaturesLemma);

                        double scoreLemma = perceptronLemma.Score(featuresLemma, tag.Lemma);
                        if (bestLemmaScore == null || scoreLemma > bestLemmaScore.Value)
                        {
                            bestLemmaScore = scoreLemma;
                            bestLemma = tag.Lemma;
                            bestLemmaFeatures = featuresLemma;
                        }
                    }

                    // No update when no lemma candidate matched the predicted MSD.
                    if (bestLemmaFeatures != null)
                    {
                        perceptronLemma.Update(bestLemmaFeatures, token.CorrectTag.Lemma, bestLemma);
                    }
                }

                // Later tokens are processed with the gold tag on this token, not the model's guess.
                token.PredictedTag = token.CorrectTag;
            }
        }

        if (IterationFinished != null)
        {
            IterationFinished(iteration + 1);
        }
    }

    if (Average)
    {
        perceptronMsd.AverageWeights();
        perceptronLemma.AverageWeights();
    }

    perceptronMsd.RemoveInsignificantWeights(WeightThreshold);
    perceptronLemma.RemoveInsignificantWeights(WeightThreshold);
}