/// <summary>
/// Processes the specified text.
/// </summary>
/// <param name="text">The text to process.</param>
/// <returns>The resulting document object.</returns>
public Document Process(string text)
{
    // Normalize the raw text before tokenizing.
    var normalizedText = NormalizerManager.Normalize(text);

    // Tokenize, then normalize, stem, and mark stop words on the token stream.
    var tokens = Tokenizer.Tokenize(normalizedText, TokenizerLanguage);
    tokens = NormalizerManager.Normalize(tokens);
    tokens = Stemmer.Stem(tokens, StemmerLanguage);
    tokens = StopWordsManager.MarkStopWords(tokens, StopWordsLanguage);

    // Split into sentences and tag each sentence's tokens with parts of speech.
    var sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);
    for (int x = 0; x < sentences.Length; ++x)
    {
        var sentence = sentences[x];
        sentence.Tokens = POSTagger.Tag(sentence.Tokens, POSTaggerLanguage);
    }

    // Find named entities, then re-detect sentences so the entity tokens are reflected.
    tokens = EntityFinder.Find(tokens, EntityFinderType);
    sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);

    return new Document(sentences, tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
}
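// A minimal usage sketch for the pipeline above. The host type name
// ("Pipeline") and the Document.Sentences property are assumptions for
// illustration only; Process is the method defined above.
//
//     var pipeline = new Pipeline();                 // hypothetical host type
//     Document doc = pipeline.Process("Jane moved to Paris. She loves the city.");
//     foreach (var sentence in doc.Sentences)        // assumed property on Document
//     {
//         Console.WriteLine(sentence);
//     }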
/// <summary>
/// Takes in a paragraph and replaces all non-ignored words with a 'smarter' synonym.
/// </summary>
/// <param name="data">Paragraph to convert.</param>
/// <returns>The 'improved' paragraph.</returns>
public static string ConvertParagraph(string data)
{
    StringBuilder output = new StringBuilder();
    string[] sentences = MEDetector.Detect(data);
    foreach (string sentence in sentences)
    {
        string[] tokens = METokenizer.Tokenize(sentence);
        Span[] names = MENameFinder.Find(tokens);
        char[] sentenceArr = sentence.ToCharArray();

        // Lower-case every capital letter that does not start a recognized name.
        for (int cCharIndex = 0; cCharIndex < sentence.Length; cCharIndex++)
        {
            if (Char.IsUpper(sentenceArr[cCharIndex]))
            {
                bool isName = false;
                for (int cSpanIndex = 0; cSpanIndex < names.Length; cSpanIndex++)
                {
                    if (cCharIndex == names[cSpanIndex].Start)
                    {
                        isName = true;
                    }
                }
                if (!isName)
                {
                    sentenceArr[cCharIndex] = Char.ToLower(sentenceArr[cCharIndex]);
                    // TODO: Have to keep track of where the capitals were in the original sentence to add them again later.
                }
            }
        }

        // Re-tokenize the decapitalized sentence, then tag and chunk it.
        tokens = METokenizer.Tokenize(new string(sentenceArr));
        string[] tags = METagger.Tag(tokens);
        string[] chunks = MEChunker.Chunk(tokens, tags);

        for (int i = 0; i < tokens.Length; i++)
        {
            if (!ConversionConditions.ExcludedPOS.Contains(tags[i]))
            {
                // Current token POS is not excluded from conversion.
                if (chunks[i].Contains("-") && ConversionConditions.IncludedPhrases.Contains(chunks[i].Split('-')[1]))
                {
                    // The containing phrase of the current token is not excluded.
                    // Map the Penn Treebank tag to a WordNet part of speech.
                    // Declared inside the loop so a tag that matches no case
                    // cannot inherit a stale value from a previous token.
                    Wnlib.PartsOfSpeech pos = Wnlib.PartsOfSpeech.Unknown;
                    switch (tags[i])
                    {
                        case "NN":
                        case "NNS":
                            pos = Wnlib.PartsOfSpeech.Noun;
                            break;
                        case "JJ":
                        case "JJR":
                        case "JJS":
                            pos = Wnlib.PartsOfSpeech.Adj;
                            break;
                        case "RB":
                        case "RBR":
                        case "RBS":
                            pos = Wnlib.PartsOfSpeech.Adv;
                            break;
                        case "VB":
                        case "VBD":
                        case "VBG":
                        case "VBN":
                        case "VBP":
                        case "VBZ":
                            pos = Wnlib.PartsOfSpeech.Verb;
                            break;
                    }
                    string mostComplexSynonym = GetMostComplexSynyonymScoredWN(tokens[i], pos);
                    output.Append(mostComplexSynonym);
                }
                else
                {
                    // The containing phrase of the current token is excluded.
                    output.Append(tokens[i]);
                }
            }
            else
            {
                // Current token POS is excluded from conversion.
                output.Append(tokens[i]);
            }

            // Check whether a space needs to be added after this token
            // (i.e., it is not at the end of the line or directly before punctuation).
            bool isBeforePunctuation = i + 1 < tokens.Length
                && Regex.IsMatch(tokens[i + 1], IS_BEFORE_PUNCTUATION_MATCH_PATTERN);
            bool isLastWord = i >= tokens.Length - (sentence.EndsWith(".") ? 2 : 1);
            output.Append(isLastWord || isBeforePunctuation ? "" : " ");
            if (tokens[i] == ".")
            {
                // Separate sentences with a space, except after the final sentence.
                output.Append(Array.IndexOf(sentences, sentence) == (sentences.Length - 1) ? "" : " ");
            }
            if (i + 1 < tokens.Length
                && ((chunks[i + 1] == "O" && tokens[i + 1].Contains("'")) || tokens[i + 1] == "'s"))
            {
                // The next token is a contraction. Remove the space between the two parts.
                output.Length--;
            }
        }
    }
    return AddPeriod(StringToSentenceCase(output.ToString()));
}
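// A minimal usage sketch for ConvertParagraph. The declaring type name
// ("ParagraphConverter") is an assumption for illustration; the method is
// static, so it is called on its declaring type.
//
//     string improved = ParagraphConverter.ConvertParagraph(
//         "The small dog ran across the big field.");
//     Console.WriteLine(improved);
//     // Expected output shape: the same sentence with eligible nouns, verbs,
//     // adjectives, and adverbs replaced by higher-scoring WordNet synonyms,
//     // restored to sentence case with a final period.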