/// <summary>
/// Creates the corpus from text.
/// Text should be long enough to generate decent results (e.g. 10Kb+).
/// </summary>
/// <remarks>
/// Every character that is not a Unicode letter, a space, a tab or a newline is stripped,
/// the remainder is lower-cased and split into words. For each letter of each word a
/// <see cref="LetterStats"/> is built from the letter, the letter before it in the same word
/// ('\0' for the first letter) and its zero-based position in the word. The first occurrence
/// of each distinct entry is kept; later equal occurrences call <c>IncrementCount()</c> on it.
/// </remarks>
/// <param name="text">The text to analyse.</param>
/// <returns>
/// A corpus holding the collected letter statistics, the total number of letters
/// and the number of words found.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public static Corpus CreateFromText(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    // Keyed and valued by the same instance so the stored entry can be found
    // through an equal, freshly built LetterStats.
    // NOTE(review): relies on LetterStats defining value equality (Equals/GetHashCode) — confirm.
    Dictionary<LetterStats, LetterStats> letterData = new Dictionary<LetterStats, LetterStats>();
    int charCount = 0;

    // Clean the text. Invariant lower-casing keeps the resulting corpus independent of the
    // current thread culture (e.g. the Turkish dotted/dotless 'i' rules).
    string clean = Regex.Replace(text, "[^\\p{L} \t\n]", string.Empty).ToLowerInvariant();
    string[] words = clean.Split(new char[] { ' ', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string word in words)
    {
        // '\0' marks "no previous letter" at the start of every word.
        char prevLetter = '\0';
        for (int position = 0; position < word.Length; position++)
        {
            char letter = word[position];
            LetterStats candidate = new LetterStats(letter, prevLetter, position);

            // Single lookup instead of ContainsKey followed by the indexer.
            LetterStats existing;
            if (letterData.TryGetValue(candidate, out existing))
            {
                existing.IncrementCount();
            }
            else
            {
                letterData.Add(candidate, candidate);
            }

            prevLetter = letter;
            charCount++;
        }
    }

    Corpus c = new Corpus()
    {
        CharCount = charCount,
        WordCount = words.Length
    };
    c.letters.AddRange(letterData.Keys);
    return c;
}
/// <summary>
/// Reads the stream from its start and gathers one statistics entry per distinct letter.
/// </summary>
/// <remarks>
/// Characters for which <see cref="char.IsLetter(char)"/> is false are ignored. Letters are
/// keyed exactly as read, so upper- and lower-case forms get separate entries.
/// <c>IncStatistic()</c> is called once for every occurrence of a letter.
/// </remarks>
/// <returns>A list with one <see cref="LetterStats"/> per distinct letter encountered.</returns>
private IList<LetterStats> FillSingleLetterStats()
{
    var dictLetter = new Dictionary<char, LetterStats>();

    _stream.ResetPositionToStart();
    while (!_stream.IsEof)
    {
        char c = _stream.ReadNextChar();
        if (!char.IsLetter(c))
        {
            continue;
        }

        // One lookup per letter instead of ContainsKey + indexer set + indexer get.
        if (!dictLetter.TryGetValue(c, out LetterStats stats))
        {
            stats = new LetterStats { Letter = c };
            dictLetter[c] = stats;
        }

        stats.IncStatistic();
    }

    return dictLetter.Values.ToList();
}
/// <summary>
/// Reads the stream from its start and gathers statistics on doubled letters, i.e. two
/// consecutive letters that are equal when compared case-insensitively.
/// </summary>
/// <remarks>
/// The specification does not say whether "AAA" is one double occurrence or two; it is
/// counted as one, because a matched pair is consumed before scanning continues ("AAAA"
/// therefore counts as two). Characters for which <c>IsNotMatterChar</c> returns true are
/// skipped without breaking a pair; any other non-letter breaks it. Entries are keyed by the
/// upper-case form of the letter.
/// </remarks>
/// <returns>
/// A list with one <see cref="LetterStats"/> per letter that occurred doubled at least once.
/// </returns>
private IList<LetterStats> FillDoubleLetterStats()
{
    // default ('\0') means "no previous letter to pair with".
    char prevChar = default;
    var dictLetter = new Dictionary<char, LetterStats>();

    _stream.ResetPositionToStart();
    while (!_stream.IsEof)
    {
        char currentChar = _stream.ReadNextChar();

        // Insignificant characters are skipped and do not interrupt a potential pair.
        if (IsNotMatterChar(currentChar))
        {
            continue;
        }

        if (!char.IsLetter(currentChar))
        {
            // A significant non-letter was met, so the previous letter can no longer form a pair.
            prevChar = default;
            continue;
        }

        // Invariant casing keeps the result independent of the current thread culture.
        char upperCurrent = char.ToUpperInvariant(currentChar);
        if (char.ToUpperInvariant(prevChar) != upperCurrent)
        {
            prevChar = currentChar;
            continue;
        }

        // A doubled letter was found. One lookup instead of ContainsKey + indexer.
        if (!dictLetter.TryGetValue(upperCurrent, out LetterStats stats))
        {
            stats = new LetterStats { Letter = upperCurrent };
            dictLetter[upperCurrent] = stats;
        }

        stats.IncStatistic();

        // Consume the pair: in "AAA" the third 'A' must not pair with the second one.
        prevChar = default;
    }

    return dictLetter.Values.ToList();
}
/// <summary>
/// Builds a lookup from each distinct character to a numeric range laid out one after another.
/// </summary>
/// <remarks>
/// Entries are processed in array order. The first entry seen for a character receives a range
/// that starts at the running offset and ends at that offset plus the entry's <c>Count</c>;
/// the next range starts one past that end. Later entries with the same character are skipped,
/// so only the first entry's <c>Count</c> contributes to that character's range.
/// </remarks>
/// <param name="letters">The letter statistics to lay out.</param>
/// <returns>A dictionary mapping each distinct character to its assigned range.</returns>
private static Dictionary<char, Range> MakeCharactersRange(LetterStats[] letters)
{
    var rangesByChar = new Dictionary<char, Range>();
    int nextFrom = 0;

    for (int i = 0; i < letters.Length; i++)
    {
        LetterStats entry = letters[i];
        if (!rangesByChar.ContainsKey(entry.Letter))
        {
            int upper = nextFrom + entry.Count;
            rangesByChar.Add(entry.Letter, new Range() { From = nextFrom, To = upper });

            // The following range begins immediately after this one's upper bound.
            nextFrom = upper + 1;
        }
    }

    return rangesByChar;
}