/// <summary>
/// Trains the MSD perceptron and the lemma perceptron from scratch on the given sentences.
/// </summary>
/// <remarks>
/// Both perceptron fields are replaced with new instances before training starts.
/// Every token is passed through <c>Normalize</c>; when <c>Reverse</c> is set the tokens of
/// each sentence are reversed first. Training makes <c>Iterations</c> passes over the data,
/// visiting the sentences in a newly shuffled order on every pass, and raises
/// <c>IterationStarted</c> / <c>IterationFinished</c> with the 1-based pass number.
/// After the last pass <c>AverageWeights</c> is called on both perceptrons when
/// <c>Average</c> is set, and <c>RemoveInsignificantWeights(WeightThreshold)</c> is always called.
/// </remarks>
/// <param name="sentences">Annotated sentences; each token's <c>CorrectTag</c> is read as the gold label.</param>
/// <exception cref="ArgumentNullException"><paramref name="sentences"/> is null.</exception>
public void Train(IEnumerable<Sentence> sentences)
{
    // Validate before touching any state so a bad call does not wipe the existing models.
    if (sentences == null)
    {
        throw new ArgumentNullException("sentences");
    }

    perceptronMsd = new Perceptron();
    perceptronLemma = new PerceptronString();

    IndexedSentence[] normalizedSentences = sentences
        .Select(s => new IndexedSentence((Reverse ? (s as IEnumerable<Token>).Reverse() : s).Select(t => Normalize(t))))
        .ToArray();

    // Collect every MSD seen in the data (lemma stripped). This set is the candidate
    // list for tokens that come without any possible tags of their own.
    HashSet<Tag> alltags = new HashSet<Tag>();
    foreach (IndexedSentence sentence in normalizedSentences)
    {
        foreach (Token token in sentence)
        {
            alltags.Add(new Tag(token.CorrectTag.Msd, null));

            // A token may carry no candidate list at all; the training loop below
            // treats that the same as an empty list.
            if (token.PossibleTags == null)
            {
                continue;
            }

            foreach (Tag tag in token.PossibleTags)
            {
                alltags.Add(new Tag(tag.Msd, null));
            }
        }
    }

    for (int iteration = 0; iteration < Iterations; iteration++)
    {
        if (IterationStarted != null)
        {
            IterationStarted(iteration + 1);
        }

        // Sorting by a random key shuffles the sentence order for this pass.
        foreach (IndexedSentence sentence in normalizedSentences.OrderBy(s => random.NextUInt()))
        {
            foreach (Token token in sentence)
            {
                // Unambiguous token: nothing to learn, just expose the gold tag as
                // context for the tokens that follow.
                if (token.PossibleTags != null && token.PossibleTags.Length == 1)
                {
                    // TODO: should this be PossibleTags[0]? The single candidate is not
                    // guaranteed to be the correct tag.
                    token.PredictedTag = token.CorrectTag;
                    continue;
                }

                // Sort the candidates by MSD so they are scored in a stable order.
                // When the token has no candidates (null or empty) fall back to
                // every MSD seen in the training data.
                IEnumerable<Tag> possibleTags = null;
                if (token.PossibleTags != null)
                {
                    possibleTags = token.PossibleTags.OrderBy(t => t.Msd).ToArray();
                }
                if (possibleTags == null || !possibleTags.Any())
                {
                    possibleTags = alltags;
                }

                // ---- MSD step: score every candidate MSD and keep the best one. ----
                double? bestMsdScore = null;
                Tag bestMsd = null;
                Features bestMsdFeatures = null;

                // Features requested with the flag set to true are computed once per token;
                // those requested with false are recomputed after each PredictedTag change.
                Features localFeaturesTag = GetFeatures(featureTemplatesTag, token, sentence, true);

                foreach (Tag tag in possibleTags)
                {
                    Tag tagMsd = new Tag(tag.Msd);

                    // GetFeatures reads the token, so the candidate must be set first.
                    token.PredictedTag = tagMsd;
                    Features featuresTag = GetFeatures(featureTemplatesTag, token, sentence, false);
                    featuresTag.AddRange(localFeaturesTag);

                    double score = perceptronMsd.Score(featuresTag, tagMsd);
                    if (bestMsdScore == null || score > bestMsdScore.Value)
                    {
                        bestMsdScore = score;
                        bestMsd = tagMsd;
                        bestMsdFeatures = featuresTag;
                    }
                }

                perceptronMsd.Update(bestMsdFeatures, new Tag(token.CorrectTag.Msd), bestMsd);

                // ---- Lemma step: only when the gold annotation has a lemma. ----
                if (token.CorrectTag.Lemma != null)
                {
                    double? bestLemmaScore = null;
                    string bestLemma = null;
                    Features bestLemmaFeatures = null;

                    token.PredictedTag = new Tag(token.CorrectTag.Msd);
                    Features localFeaturesLemma = GetFeatures(featureTemplatesLemma, token, sentence, true);

                    foreach (Tag tag in possibleTags)
                    {
                        // NOTE(review): candidates are filtered by the *predicted* MSD while
                        // the features below are built with the *correct* MSD - confirm
                        // this asymmetry is intended.
                        if (tag.Msd != bestMsd.Msd || tag.Lemma == null)
                        {
                            continue;
                        }

                        token.PredictedTag = new Tag(token.CorrectTag.Msd, tag.Lemma);
                        Features featuresLemma = GetFeatures(featureTemplatesLemma, token, sentence, false);
                        featuresLemma.AddRange(localFeaturesLemma);

                        double scoreLemma = perceptronLemma.Score(featuresLemma, tag.Lemma);
                        if (bestLemmaScore == null || scoreLemma > bestLemmaScore.Value)
                        {
                            bestLemmaScore = scoreLemma;
                            bestLemma = tag.Lemma;
                            bestLemmaFeatures = featuresLemma;
                        }
                    }

                    // No candidate carried a lemma for the predicted MSD: nothing to update.
                    if (bestLemmaFeatures != null)
                    {
                        perceptronLemma.Update(bestLemmaFeatures, token.CorrectTag.Lemma, bestLemma);
                    }
                }

                // Later tokens in the sentence see the gold tag as their context.
                token.PredictedTag = token.CorrectTag;
            }
        }

        if (IterationFinished != null)
        {
            IterationFinished(iteration + 1);
        }
    }

    if (Average)
    {
        perceptronMsd.AverageWeights();
        perceptronLemma.AverageWeights();
    }

    perceptronMsd.RemoveInsignificantWeights(WeightThreshold);
    perceptronLemma.RemoveInsignificantWeights(WeightThreshold);
}
/// <summary>
/// Replaces both perceptrons with models read from <paramref name="stream"/>.
/// </summary>
/// <remarks>
/// The MSD perceptron is read first and the lemma perceptron second, both from the same
/// stream. The fields are only overwritten after both reads have completed, so a failure
/// while reading leaves the previously held models untouched instead of a mismatched pair.
/// </remarks>
/// <param name="stream">Stream positioned at the start of the serialized MSD perceptron.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public void Load(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    // Read into locals first; the order of the two reads must match the stream layout.
    Perceptron loadedMsd = new Perceptron();
    loadedMsd.Load(stream);

    PerceptronString loadedLemma = new PerceptronString();
    loadedLemma.Load(stream);

    // Commit only once both models were read successfully.
    perceptronMsd = loadedMsd;
    perceptronLemma = loadedLemma;
}