/// <summary>
/// Reads the embedded resource with the given name through a new
/// <c>LuMiiCorpus</c> and returns the loaded sentences as an array.
/// </summary>
/// <param name="resourceName">
/// Manifest resource name, resolved against the assembly that declares this object's runtime type.
/// </param>
/// <returns>
/// The result of <c>LuMiiCorpus.Load</c>, copied into an array before the resource stream is disposed.
/// </returns>
private Sentence[] LoadAnalyzedCorpus(string resourceName)
{
    LuMiiCorpus reader = new LuMiiCorpus();
    Stream resource = this.GetType().Assembly.GetManifestResourceStream(resourceName);
    using (resource)
    {
        // Materialize inside the using block so the stream is still open while Load's result is enumerated.
        Sentence[] loaded = reader.Load(resource).ToArray();
        return loaded;
    }
}
/// <summary>
/// Loads sentences from an embedded resource and rebuilds each one so that every token
/// carries the possible tags returned by <c>LuMiiMorphology.Analyze</c> for its true-case text.
/// </summary>
/// <param name="resourceName">
/// Manifest resource name, resolved against the assembly that declares this object's runtime type.
/// </param>
/// <param name="ignoreIncorrect">
/// When <c>true</c>, a sentence is left out of the result if at least one of its tokens has a
/// correct tag that is not among the tags produced by the analyzer. When <c>false</c> (the
/// default), every sentence is returned.
/// </param>
/// <returns>The rebuilt sentences, in the order they were loaded.</returns>
private Sentence[] LoadUnanalyzedCorpus(string resourceName, bool ignoreIncorrect = false)
{
    LuMiiCorpus corpus = new LuMiiCorpus();
    LuMiiMorphology morphology = new LuMiiMorphology();

    Sentence[] sentences;
    using (Stream stream = this.GetType().Assembly.GetManifestResourceStream(resourceName))
    {
        // Materialize while the stream is open.
        sentences = corpus.Load(stream).ToArray();
    }

    List<Sentence> goodSentences = new List<Sentence>();

    // Time only the morphological analysis; the elapsed time goes to the debug output below.
    Stopwatch watch = Stopwatch.StartNew();

    foreach (Sentence sentence in sentences)
    {
        // Set when any token's correct tag is missing from the analyzer's candidates.
        bool ignore = false;
        Sentence analyzedSentence = new Sentence();

        foreach (Token token in sentence)
        {
            Tag[] possibleTags = morphology.Analyze(token.TextTrueCase).ToArray();

            if (!possibleTags.Any(t => t.Equals(token.CorrectTag)))
            {
                ignore = true;
            }

            // Keep building the sentence even after a miss: it is still returned
            // in full when ignoreIncorrect is false.
            Token analyzedToken = new Token(token.TextTrueCase, possibleTags, token.CorrectTag, analyzedSentence);
            analyzedSentence.Add(analyzedToken);
        }

        if (!ignoreIncorrect || !ignore)
        {
            goodSentences.Add(analyzedSentence);
        }
    }

    watch.Stop();
    Debug.WriteLine(watch.Elapsed);

    return goodSentences.ToArray();
}
/// <summary>
/// Loads every corpus resource listed in <c>ModelLatestData</c>, tags the combined sentences with a
/// freshly loaded <c>LuMiiTagger</c>, and asserts that the fraction of tokens whose
/// <c>IsTagCorrect</c> is true is greater than 0.99.
/// </summary>
public void Model_Latest()
{
    LuMiiCorpus corpus = new LuMiiCorpus();
    List<Sentence> all = new List<Sentence>();

    foreach (string resource in ModelLatestData)
    {
        // Dispose each resource stream once its sentences have been materialized;
        // previously the stream was handed to Load and never closed.
        using (Stream stream = this.GetType().Assembly.GetManifestResourceStream(resource))
        {
            all.AddRange(corpus.Load(stream).ToArray());
        }
    }

    LuMiiTagger tagger = new LuMiiTagger();
    tagger.Load();
    tagger.Tag(all);

    Token[] tokens = all.SelectMany(t => t).ToArray();

    // Cast before dividing so the ratio is computed in floating point.
    double accuracy = (double)tokens.Count(t => t.IsTagCorrect) / tokens.Length;
    Assert.Greater(accuracy, 0.99);
}