/// <summary>
/// Segments <paramref name="sentence"/>, collects candidate correction points and
/// applies every candidate accepted by <c>CheckCorrectPoint</c>, visiting them in
/// the order produced by <c>ReOrderCorrectPoint</c>.
/// </summary>
/// <param name="sentence">The sentence to correct.</param>
/// <returns>The sentence with all accepted replacements spliced in.</returns>
public string DoCorrect(string sentence)
{
    var words = _segmenter.Cut(sentence);
    var candidates = GetCandidateCorrectPoint(words);

    var corrected = sentence;

    // Highest index already covered by an applied correction; later points at or
    // below this index are skipped so that corrections do not overlap.
    // NOTE(review): because this starts at 0 and the comparison is strict, a point
    // whose WordIndex is 0 is never applied - confirm whether WordIndex is 1-based.
    var coveredUpTo = 0;

    foreach (var point in ReOrderCorrectPoint(candidates))
    {
        // The validity check only runs for points beyond the covered range.
        if (point.WordIndex > coveredUpTo && CheckCorrectPoint(words, point))
        {
            // NOTE(review): CharIndex is applied to the already-edited string, which
            // presumably relies on each replacement having the same length as the
            // span it replaces - confirm against the correct-point producer.
            var head = corrected.Substring(0, point.CharIndex);
            var replacement = point.Word;
            var tail = corrected.Substring(point.CharIndex + point.Length);
            corrected = head + replacement + tail;

            coveredUpTo = point.WordIndex + point.Length - 1;
        }
    }

    return corrected;
}
/// <summary>
/// Reads every file in <c>InputDataFolder</c> line by line, splits the ninth
/// tab-separated column into sentences, segments each sentence and feeds the
/// words to <c>UnigramUpdate</c> and <c>BigramUpdate</c>. Afterwards the collected
/// counts are handed to <c>UnigramProbability</c> / <c>BigramProbability</c> and the
/// resulting dictionaries are passed to <c>SaveModel</c>.
/// </summary>
public void Execution()
{
    // Lines with fewer columns than this are ignored entirely (not even counted).
    const int requiredColumnCount = 9;
    const int sentenceColumnIndex = 8;

    var unigramCounts = new Dictionary<string, int>();
    var bigramCounts = new Dictionary<string, Dictionary<string, int>>();
    char[] sentenceSeparators = ",.!;,。;!".ToCharArray();

    var inputFolder = new DirectoryInfo(InputDataFolder);
    long fileCount = 0;
    long lineCount = 0; // cumulative across all files, not reset per file

    foreach (var file in inputFolder.EnumerateFiles())
    {
        fileCount++;

        using (var reader = new StreamReader(file.FullName))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                var columns = line.Split('\t');
                if (columns.Length >= requiredColumnCount)
                {
                    var sentences = columns[sentenceColumnIndex].Split(sentenceSeparators, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var sentence in sentences)
                    {
                        var words = _segmenter.Cut(sentence);
                        UnigramUpdate(words, unigramCounts);
                        BigramUpdate(words, bigramCounts);
                    }

                    lineCount++;

                    // Progress report every 10000 processed lines.
                    if (lineCount % 10000 == 0)
                    {
                        var lineNoun = lineCount == 1 ? "Line" : "Lines";
                        Console.WriteLine($"[{file.Name}]: We have handled {lineCount} {lineNoun}");
                    }
                }
            }
        }

        // Reported once per file, after its reader has been disposed.
        var fileNoun = fileCount == 1 ? "File" : "Files";
        Console.WriteLine($"We have handled {fileCount} {fileNoun}");
    }

    // Output dictionaries are created empty here and passed to the probability
    // helpers, which presumably populate them from the counts - verify in helpers.
    var unigramProbabilities = new Dictionary<string, double>();
    var bigramProbabilities = new Dictionary<string, Dictionary<string, double>>();
    UnigramProbability(unigramCounts, unigramProbabilities);
    BigramProbability(bigramCounts, bigramProbabilities);

    SaveModel(unigramProbabilities, bigramProbabilities);
}
/// <summary>
/// Normalizes <paramref name="text"/> by optionally lower-casing it, optionally
/// running it through the speller model, segmenting it, optionally dropping
/// stop words, and concatenating the remaining segments with no separator.
/// </summary>
/// <param name="text">The text to normalize.</param>
/// <param name="lowerCase">When true, the text is lower-cased before any other step.</param>
/// <param name="removeStopWords">When true, segments contained in the stop-word collection are dropped.</param>
/// <param name="spellerCheck">When true, the text is passed through the speller model's <c>DoCorrect</c> before segmentation.</param>
/// <param name="useStem">Not read by this implementation.</param>
/// <returns>The remaining segments joined without a separator.</returns>
public override string Normalize(string text, bool lowerCase = false, bool removeStopWords = false, bool spellerCheck = false, bool useStem = false)
{
    // NOTE(review): ToLower() uses the current culture - confirm that is intended.
    var prepared = lowerCase ? text.ToLower() : text;

    if (spellerCheck)
    {
        prepared = _spellerModel.DoCorrect(prepared);
    }

    var segments = _segmenter.Cut(prepared);
    if (!removeStopWords)
    {
        return string.Join("", segments);
    }

    var keptSegments = segments.Where(segment => !_stopwords.Contains(segment)).ToList();
    return string.Join("", keptSegments);
}