Пример #1
0
        /// <summary>
        /// Runs the supplied text through the processing pipeline and wraps the outcome in a document.
        /// </summary>
        /// <param name="text">The raw text to process.</param>
        /// <returns>A document built from the detected sentences, the tokens and the original text.</returns>
        public Document Process(string text)
        {
            // Text-level normalization runs first, then the text is tokenized.
            var normalizedText = NormalizerManager.Normalize(text);
            var tokens = Tokenizer.Tokenize(normalizedText, TokenizerLanguage);

            // Token-level passes, each one feeding the next.
            tokens = NormalizerManager.Normalize(tokens);
            tokens = Stemmer.Stem(tokens, StemmerLanguage);
            tokens = StopWordsManager.MarkStopWords(tokens, StopWordsLanguage);

            // First sentence split: only used to hand the tagger one sentence at a time.
            var sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);
            for (int index = 0; index < sentences.Length; ++index)
            {
                var current = sentences[index];
                current.Tokens = POSTagger.Tag(current.Tokens, POSTaggerLanguage);
            }

            // Entity detection works on the full token stream; sentences are then
            // detected again from the resulting tokens.
            tokens = EntityFinder.Find(tokens, EntityFinderType);
            sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);

            return new Document(sentences, tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
        }
Пример #2
0
        /// <summary>
        /// Take in a paragraph and replace all non-ignored words with a 'smarter' synonym.
        /// </summary>
        /// <remarks>
        /// The paragraph is split into sentences; each sentence has its non-name capitals
        /// lowered, is tokenized, tagged and chunked, and every token whose tag is not in
        /// <c>ConversionConditions.ExcludedPOS</c> and whose chunk phrase is in
        /// <c>ConversionConditions.IncludedPhrases</c> is replaced by the result of
        /// <c>GetMostComplexSynyonymScoredWN</c>. Tokens are re-joined with single spaces,
        /// except before punctuation, at the end of a sentence and inside contractions.
        /// </remarks>
        /// <param name="data">Paragraph to convert.</param>
        /// <returns>The 'improved' paragraph.</returns>
        public static string ConvertParagraph(string data)
        {
            StringBuilder output = new StringBuilder();

            string[] sentences = MEDetector.Detect(data);

            // Iterate by index so the "is this the last sentence?" check stays correct when the
            // same sentence text occurs more than once (Array.IndexOf finds only the first match).
            for (int sentenceIndex = 0; sentenceIndex < sentences.Length; sentenceIndex++)
            {
                string sentence       = sentences[sentenceIndex];
                bool   isLastSentence = sentenceIndex == sentences.Length - 1;

                // Names are located on the original casing, then the sentence is re-tokenized
                // after capitals that do not start a name have been lowered.
                Span[]   names  = MENameFinder.Find(METokenizer.Tokenize(sentence));
                string[] tokens = METokenizer.Tokenize(LowercaseNonNameCapitals(sentence, names));
                string[] tags   = METagger.Tag(tokens);
                string[] chunks = MEChunker.Chunk(tokens, tags);

                // Tokens at or after this index never get a trailing space: the final token,
                // plus the one before it when the sentence ends with a period.
                int firstUnspacedIndex = tokens.Length - (sentence.EndsWith(".", StringComparison.Ordinal) ? 2 : 1);

                for (int i = 0; i < tokens.Length; i++)
                {
                    string replacement = tokens[i];

                    if (!ConversionConditions.ExcludedPOS.Contains(tags[i]))
                    {
                        // Current token POS is not excluded from conversion; the token is converted
                        // only when the part of its chunk label after the '-' is an included phrase.
                        string[] chunkParts = chunks[i].Split('-');
                        if (chunkParts.Length > 1 && ConversionConditions.IncludedPhrases.Contains(chunkParts[1]))
                        {
                            // The part of speech is resolved per token so that a tag without a
                            // mapping does not inherit the previous token's part of speech.
                            replacement = GetMostComplexSynyonymScoredWN(tokens[i], MapTagToPartOfSpeech(tags[i]));
                        }
                    }

                    output.Append(replacement);

                    bool hasNext = i + 1 < tokens.Length;

                    // Checking if a space needs to be added after this token (eg, it is not at the end of the line).
                    bool isBeforePunctuation = hasNext && Regex.IsMatch(tokens[i + 1], IS_BEFORE_PUNCTUATION_MATCH_PATTERN);
                    if (i < firstUnspacedIndex && !isBeforePunctuation)
                    {
                        output.Append(' ');
                    }

                    // Separate sentences with a space, but do not leave one after the last sentence.
                    if (tokens[i] == "." && !isLastSentence)
                    {
                        output.Append(' ');
                    }

                    bool nextIsContraction = hasNext
                                             && i + 1 < chunks.Length
                                             && ((chunks[i + 1] == "O" && tokens[i + 1].Contains("'")) || tokens[i + 1] == "'s");

                    // This is a contraction. Remove the space between the two parts, but only if
                    // a space was actually appended; never trim a character of the token itself.
                    if (nextIsContraction && output.Length > 0 && output[output.Length - 1] == ' ')
                    {
                        output.Length--;
                    }
                }
            }

            return(AddPeriod(StringToSentenceCase(output.ToString())));
        }

        /// <summary>
        /// Returns a copy of the sentence in which every upper-case character is lowered unless
        /// its index equals the <c>Start</c> of one of the supplied name spans.
        /// </summary>
        /// <param name="sentence">The sentence to process.</param>
        /// <param name="names">Name spans found in the sentence.</param>
        /// <returns>The sentence with non-name capitals lowered.</returns>
        private static string LowercaseNonNameCapitals(string sentence, Span[] names)
        {
            char[] characters = sentence.ToCharArray();

            for (int charIndex = 0; charIndex < characters.Length; charIndex++)
            {
                if (!Char.IsUpper(characters[charIndex]))
                {
                    continue;
                }

                // NOTE(review): this compares a character index with Span.Start. If the name
                // finder reports token indices rather than character offsets, this comparison
                // is wrong - confirm against the name finder's contract.
                bool isName = false;
                for (int spanIndex = 0; spanIndex < names.Length; spanIndex++)
                {
                    if (charIndex == names[spanIndex].Start)
                    {
                        isName = true;
                        break;
                    }
                }

                if (!isName)
                {
                    characters[charIndex] = Char.ToLower(characters[charIndex]);
                    // TODO: Have to keep track of where the capitals were in the original sentence to add them again later.
                }
            }

            return new string(characters);
        }

        /// <summary>
        /// Maps a part-of-speech tag to the corresponding <c>Wnlib.PartsOfSpeech</c> value.
        /// </summary>
        /// <param name="tag">The part-of-speech tag of a token.</param>
        /// <returns>
        /// Noun, Adj, Adv or Verb for the handled tags; <c>Wnlib.PartsOfSpeech.Unknown</c> otherwise.
        /// </returns>
        private static Wnlib.PartsOfSpeech MapTagToPartOfSpeech(string tag)
        {
            switch (tag)
            {
            case "NN":
            case "NNS":
                return Wnlib.PartsOfSpeech.Noun;

            case "JJ":
            case "JJR":
            case "JJS":
                return Wnlib.PartsOfSpeech.Adj;

            case "RB":
            case "RBR":
            case "RBS":
                return Wnlib.PartsOfSpeech.Adv;

            case "VB":
            case "VBD":
            case "VBG":
            case "VBN":
            case "VBP":
            case "VBZ":
                return Wnlib.PartsOfSpeech.Verb;

            default:
                return Wnlib.PartsOfSpeech.Unknown;
            }
        }