public TextGenerator(WordTypes wordType = WordTypes.Word)
{
    // Names are generated from their own embedded corpus; every other
    // word type uses the general text corpus.
    string resourceName = wordType == WordTypes.Name ? "names.bin" : "text.bin";

    this.corpus = Corpus.DeserializeFromEmbeddedResource(resourceName);
    this.wordType = wordType;
}
/// <summary>
/// Initializes a new instance of the <see cref="TextGenerator"/> class.
/// Will generate text based on the provided corpus. Use this to generate fake text based on different languages.
/// </summary>
/// <param name="corpus">The corpus to generate text from. Must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="corpus"/> is <c>null</c>.</exception>
public TextGenerator(Corpus corpus)
{
    // ArgumentNullException derives from ArgumentException, so callers that
    // already catch ArgumentException keep working while the failure now
    // identifies the offending parameter.
    if (corpus == null)
    {
        throw new ArgumentNullException("corpus", "Corpus is not provided.");
    }

    this.corpus = corpus;
}
/// <summary>
/// Creates the corpus from text.
/// Text should be long enough to generate decent results (e.g. 10Kb+).
/// </summary>
/// <param name="text">The text to analyse. Must not be <c>null</c>.</param>
/// <returns>
/// A <see cref="Corpus"/> whose <c>CharCount</c> is the number of letters processed,
/// whose <c>WordCount</c> is the number of words found, and whose letter list holds
/// one <c>LetterStats</c> entry per distinct dictionary key (distinctness is decided
/// by <c>LetterStats</c> equality).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public static Corpus CreateFromText(string text)
{
    // Regex.Replace would throw ArgumentNullException for a null input anyway;
    // guarding here reports the caller-facing parameter name instead.
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    var letterData = new Dictionary<LetterStats, LetterStats>();
    int charCount = 0;

    // Strip everything except Unicode letters and the three separators that
    // are split on below, then lower-case (ToLower uses the current culture).
    string clean = Regex.Replace(text, "[^\\p{L} \t\n]", string.Empty).ToLower();
    string[] words = clean.Split(new char[] { ' ', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string word in words)
    {
        // '\0' is passed as the previous letter for the first letter of each word.
        char prevLetter = '\0';

        for (int position = 0; position < word.Length; position++)
        {
            char letter = word[position];
            LetterStats candidate = new LetterStats(letter, prevLetter, position);

            // Single lookup: bump the stored entry if an equal key exists,
            // otherwise register the new entry as both key and value.
            LetterStats existing;
            if (letterData.TryGetValue(candidate, out existing))
            {
                existing.IncrementCount();
            }
            else
            {
                letterData.Add(candidate, candidate);
            }

            prevLetter = letter;
            charCount++;
        }
    }

    Corpus c = new Corpus() { CharCount = charCount, WordCount = words.Length };
    c.letters.AddRange(letterData.Keys);
    return c;
}