Пример #1
0
        /// <summary>
        /// Applies the accepted spelling corrections to <paramref name="sentence"/> and returns the result.
        /// </summary>
        /// <remarks>
        /// The sentence is segmented with <c>_segmenter.Cut</c>, candidate points are collected by
        /// <c>GetCandidateCorrectPoint</c> and visited in the order produced by <c>ReOrderCorrectPoint</c>.
        /// A candidate is applied only when its <c>WordIndex</c> lies beyond the last covered index and
        /// <c>CheckCorrectPoint</c> accepts it; applying it replaces <c>Length</c> characters starting at
        /// <c>CharIndex</c> with the candidate's <c>Word</c>.
        /// NOTE(review): the covered index starts at 0, so a candidate with <c>WordIndex == 0</c> is never
        /// applied - confirm whether word indices are 1-based or this should start at -1.
        /// NOTE(review): <c>CharIndex</c> values are not re-based after a replacement, and <c>Length</c> is
        /// used both as a character count and as a word-index span - confirm this is safe for the
        /// candidates that <c>ReOrderCorrectPoint</c> yields.
        /// </remarks>
        /// <param name="sentence">The text to correct.</param>
        /// <returns>The sentence with every accepted correction substituted in.</returns>
        public string DoCorrect(string sentence)
        {
            var words            = _segmenter.Cut(sentence);
            var candidates       = GetCandidateCorrectPoint(words);
            var corrected        = sentence;
            var lastCoveredIndex = 0;

            foreach (var point in ReOrderCorrectPoint(candidates))
            {
                // The short-circuit keeps CheckCorrectPoint from running on candidates that are already covered.
                if (point.WordIndex > lastCoveredIndex && CheckCorrectPoint(words, point))
                {
                    var head        = corrected.Substring(0, point.CharIndex);
                    var replacement = point.Word;
                    var tail        = corrected.Substring(point.CharIndex + point.Length);

                    corrected        = head + replacement + tail;
                    lastCoveredIndex = point.WordIndex + point.Length - 1;
                }
            }

            return corrected;
        }
Пример #2
0
        /// <summary>
        /// Builds unigram and bigram counts from every file in <c>InputDataFolder</c>, converts them to
        /// probabilities and hands both tables to <c>SaveModel</c>.
        /// </summary>
        /// <remarks>
        /// Each line is split on tabs and lines with fewer than nine fields are ignored. The ninth field is
        /// split into sentences on the punctuation separators; every sentence is segmented with
        /// <c>_segmenter.Cut</c> and fed to <c>UnigramUpdate</c> and <c>BigramUpdate</c>.
        /// Progress is written to the console after every 10000 accepted lines and after every file.
        /// The line counter is cumulative across files, not per file.
        /// </remarks>
        public void Execution()
        {
            char[] sentenceSeparators = ",.!;,。;!".ToCharArray();
            var    unigramCounts      = new Dictionary<string, int>();
            var    bigramCounts       = new Dictionary<string, Dictionary<string, int>>();
            long   fileCount          = 0;
            long   lineCount          = 0;

            foreach (var file in new DirectoryInfo(InputDataFolder).EnumerateFiles())
            {
                fileCount++;

                using (var reader = new StreamReader(file.FullName))
                {
                    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                    {
                        string[] fields = line.Split('\t');

                        // Only lines that carry the ninth (text) column are counted and processed.
                        if (fields.Length >= 9)
                        {
                            foreach (var sentence in fields[8].Split(sentenceSeparators, StringSplitOptions.RemoveEmptyEntries))
                            {
                                var words = _segmenter.Cut(sentence);
                                UnigramUpdate(words, unigramCounts);
                                BigramUpdate(words, bigramCounts);
                            }

                            lineCount++;
                            if (lineCount % 10000 == 0)
                            {
                                string lineNoun = lineCount == 1 ? "Line" : "Lines";
                                Console.WriteLine($"[{file.Name}]: We have handled {lineCount} {lineNoun}");
                            }
                        }
                    }
                }

                string fileNoun = fileCount == 1 ? "File" : "Files";
                Console.WriteLine($"We have handled {fileCount} {fileNoun}");
            }

            // Turn the raw counts into probability tables and persist them.
            var unigramProbabilities = new Dictionary<string, double>();
            var bigramProbabilities  = new Dictionary<string, Dictionary<string, double>>();

            UnigramProbability(unigramCounts, unigramProbabilities);
            BigramProbability(bigramCounts, bigramProbabilities);
            SaveModel(unigramProbabilities, bigramProbabilities);
        }
Пример #3
0
        /// <summary>
        /// Normalizes <paramref name="text"/>: optionally lower-cases it, optionally runs it through
        /// <c>_spellerModel.DoCorrect</c>, segments it with <c>_segmenter.Cut</c>, optionally drops
        /// stop words, and returns the remaining segments concatenated without a separator.
        /// </summary>
        /// <param name="text">The text to normalize.</param>
        /// <param name="lowerCase">When <c>true</c>, the text is lower-cased before any other step.</param>
        /// <param name="removeStopWords">When <c>true</c>, segments contained in <c>_stopwords</c> are removed.</param>
        /// <param name="spellerCheck">When <c>true</c>, the text is spell-corrected before segmentation.</param>
        /// <param name="useStem">Not used by this implementation.</param>
        /// <returns>The normalized text.</returns>
        public override string Normalize(string text, bool lowerCase = false, bool removeStopWords = false, bool spellerCheck = false,
                                         bool useStem = false)
        {
            if (lowerCase)
            {
                // Invariant casing keeps the output (and therefore stop-word matching) independent of the
                // current thread culture; culture-sensitive ToLower() maps e.g. 'I' differently under tr-TR.
                text = text.ToLowerInvariant();
            }

            if (spellerCheck)
            {
                text = _spellerModel.DoCorrect(text);
            }

            var wordsList = _segmenter.Cut(text);

            if (removeStopWords)
            {
                wordsList = wordsList.Where(x => !_stopwords.Contains(x)).ToList();
            }

            return string.Join("", wordsList);
        }