/// <summary>
/// Runs cross-validation over <c>Sentences</c> using <c>Folds</c> folds.
/// Sentences are optionally shuffled (when <c>Randomize</c> is set, seeded by
/// <c>RandomSeed</c>), dealt round-robin into folds, and for every fold a fresh
/// tagger of type <typeparamref name="T"/> is trained on clones of the other
/// folds and then asked to tag clones of the held-out fold.
/// </summary>
/// <returns>
/// One <c>CrossValidationResults.Result</c> per fold (added in completion order,
/// which is not deterministic because folds run in parallel) together with the
/// elapsed time of the whole evaluation in <c>Duration</c>.
/// </returns>
public CrossValidationResults Evaluate()
{
    Helpers.XorShiftRandom random = new Helpers.XorShiftRandom(RandomSeed);
    List<Sentence> sentences = Randomize
        ? Sentences.OrderBy(s => random.NextUInt()).ToList()
        : Sentences;

    // Deal the sentences round-robin so fold sizes differ by at most one.
    List<Sentence>[] folds = new List<Sentence>[Folds];
    for (int i = 0; i < Folds; i++)
    {
        folds[i] = new List<Sentence>();
    }
    for (int i = 0; i < sentences.Count; i++)
    {
        folds[i % Folds].Add(sentences[i]);
    }

    CrossValidationResults results = new CrossValidationResults();

    // Single gate protecting every piece of state shared between fold workers:
    // the iteration counter and the results container.
    object sync = new object();

    // Stopwatch is monotonic; wall-clock subtraction can be skewed by clock adjustments.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
    int iterationsCount = 0;

    Parallel.For(0, Folds, (i) =>
    {
        CrossValidationResults.Result result = new CrossValidationResults.Result();
        result.Fold = i + 1;

        // Work on clones so that tagging/training in one fold cannot affect another.
        result.Test = folds[i].Select(sentence => sentence.Clone()).ToList();
        result.Train = new List<Sentence>();
        for (int j = 0; j < Folds; j++)
        {
            if (j != i)
            {
                result.Train.AddRange(folds[j].Select(sentence => sentence.Clone()));
            }
        }

        ITrainableTagger tagger = Activator.CreateInstance<T>();

        // Only the perceptron tagger exposes per-iteration progress.
        Tagging.Perceptron.PerceptronTagger ptagger = tagger as Tagging.Perceptron.PerceptronTagger;
        if (ptagger != null)
        {
            ptagger.IterationFinished += (it) =>
            {
                // Capture the incremented value inside the lock; re-reading the
                // shared counter afterwards would race with the other folds.
                int completed;
                lock (sync)
                {
                    completed = ++iterationsCount;
                }

                // Copy the delegate so it cannot become null between check and call.
                var handler = Progress;
                if (handler != null)
                {
                    handler((double)completed / (ptagger.Iterations * Folds));
                }
            };
        }

        tagger.Train(result.Train);
        tagger.Tag(result.Test);

        // NOTE(review): CrossValidationResults is not shown to be thread-safe,
        // so concurrent Add calls are serialised here.
        lock (sync)
        {
            results.Add(result);
        }
    });

    results.Duration = stopwatch.Elapsed;
    return results;
}
/// <summary>
/// Trains the MSD perceptron and the lemma perceptron from scratch on the given
/// sentences. Both models are re-created at the start of the call, trained for
/// <c>Iterations</c> passes over the (shuffled) sentences, optionally averaged
/// when <c>Average</c> is set, and finally pruned with <c>WeightThreshold</c>.
/// </summary>
/// <param name="sentences">
/// Training sentences. Each is passed through <c>Normalize</c> token by token
/// (in reverse token order when <c>Reverse</c> is set) before training.
/// </param>
/// <remarks>
/// Raises <c>IterationStarted</c> and <c>IterationFinished</c> with the 1-based
/// iteration number. After each token is processed its <c>PredictedTag</c> is set
/// to its <c>CorrectTag</c>.
/// </remarks>
public void Train(IEnumerable<Sentence> sentences)
{
    perceptronMsd = new Perceptron();
    perceptronLemma = new PerceptronString();

    IndexedSentence[] normalizedSentences = sentences
        .Select(s => new IndexedSentence(
            (Reverse ? (s as IEnumerable<Token>).Reverse() : s).Select(t => Normalize(t))))
        .ToArray();

    // Collect every MSD seen in the data (lemma stripped); used as the candidate
    // set for tokens that come without any possible tags.
    HashSet<Tag> alltags = new HashSet<Tag>();
    foreach (IndexedSentence sentence in normalizedSentences)
    {
        foreach (Token token in sentence)
        {
            alltags.Add(new Tag(token.CorrectTag.Msd, null));

            // PossibleTags is treated as nullable elsewhere in this method,
            // so guard the enumeration as well.
            if (token.PossibleTags != null)
            {
                foreach (Tag tag in token.PossibleTags)
                {
                    alltags.Add(new Tag(tag.Msd, null));
                }
            }
        }
    }

    for (int iteration = 0; iteration < Iterations; iteration++)
    {
        if (IterationStarted != null)
        {
            IterationStarted(iteration + 1);
        }

        // Visit the sentences in a fresh random order on every pass.
        foreach (IndexedSentence sentence in normalizedSentences.OrderBy(s => random.NextUInt()))
        {
            foreach (Token token in sentence)
            {
                // Unambiguous token: nothing to learn from it.
                if (token.PossibleTags != null && token.PossibleTags.Length == 1)
                {
                    token.PredictedTag = token.CorrectTag; // todo: possibleTags[0]?? what if it's not correct tag?
                    continue;
                }

                // Candidates are the token's own tags ordered by MSD; fall back to
                // every known MSD when the token has none (null or empty).
                IEnumerable<Tag> possibleTags = alltags;
                if (token.PossibleTags != null && token.PossibleTags.Length > 0)
                {
                    possibleTags = token.PossibleTags.OrderBy(t => t.Msd).ToArray();
                }

                // ---- MSD step: score every candidate MSD and keep the best. ----
                double? bestMsdScore = null;
                Tag bestMsd = null;
                Features bestMsdFeatures = null;

                // Features requested with the 'true' flag are computed once per token;
                // those requested with 'false' are recomputed per candidate because
                // PredictedTag is changed before each call.
                Features localFeaturesTag = GetFeatures(featureTemplatesTag, token, sentence, true);
                foreach (Tag tag in possibleTags)
                {
                    Tag tagMsd = new Tag(tag.Msd);
                    token.PredictedTag = tagMsd;

                    Features featuresTag = GetFeatures(featureTemplatesTag, token, sentence, false);
                    featuresTag.AddRange(localFeaturesTag);

                    double score = perceptronMsd.Score(featuresTag, tagMsd);
                    if (bestMsdScore == null || score > bestMsdScore.Value)
                    {
                        bestMsdScore = score;
                        bestMsd = tagMsd;
                        bestMsdFeatures = featuresTag;
                    }
                }
                perceptronMsd.Update(bestMsdFeatures, new Tag(token.CorrectTag.Msd), bestMsd);

                // ---- Lemma step: only when a gold lemma is available. ----
                if (token.CorrectTag.Lemma != null)
                {
                    double? bestLemmaScore = null;
                    string bestLemma = null;
                    Features bestLemmaFeatures = null;

                    token.PredictedTag = new Tag(token.CorrectTag.Msd);
                    Features localFeaturesLemma = GetFeatures(featureTemplatesLemma, token, sentence, true);
                    foreach (Tag tag in possibleTags)
                    {
                        // Only lemmas attached to the MSD chosen above are considered.
                        if (tag.Msd != bestMsd.Msd || tag.Lemma == null)
                        {
                            continue;
                        }

                        token.PredictedTag = new Tag(token.CorrectTag.Msd, tag.Lemma);
                        Features featuresLemma = GetFeatures(featureTemplatesLemma, token, sentence, false);
                        featuresLemma.AddRange(localFeaturesLemma);

                        double scoreLemma = perceptronLemma.Score(featuresLemma, tag.Lemma);
                        if (bestLemmaScore == null || scoreLemma > bestLemmaScore.Value)
                        {
                            bestLemmaScore = scoreLemma;
                            bestLemma = tag.Lemma;
                            bestLemmaFeatures = featuresLemma;
                        }
                    }

                    // No candidate lemma matched the chosen MSD: skip the update.
                    if (bestLemmaFeatures != null)
                    {
                        perceptronLemma.Update(bestLemmaFeatures, token.CorrectTag.Lemma, bestLemma);
                    }
                }

                // Leave the gold tag in place so later tokens see correct context.
                token.PredictedTag = token.CorrectTag;
            }
        }

        if (IterationFinished != null)
        {
            IterationFinished(iteration + 1);
        }
    }

    if (Average)
    {
        perceptronMsd.AverageWeights();
        perceptronLemma.AverageWeights();
    }

    perceptronMsd.RemoveInsignificantWeights(WeightThreshold);
    perceptronLemma.RemoveInsignificantWeights(WeightThreshold);
}