Exemplo n.º 1
        /// <summary>
        /// Runs the specified text through the processing pipeline and wraps the result in a <see cref="Document"/>.
        /// </summary>
        /// <remarks>
        /// Steps, in order: normalize the raw text, tokenize it, normalize the tokens, stem them,
        /// mark stop words, detect sentences, POS-tag the tokens of each sentence, run the entity
        /// finder over the full token list, and finally detect sentences again from its output.
        /// </remarks>
        /// <param name="text">The text to process.</param>
        /// <returns>
        /// The resulting document object, built from the final sentences and tokens together
        /// with the original, un-normalized <paramref name="text"/>.
        /// </returns>
        public Document Process(string text)
        {
            var TempText = NormalizerManager.Normalize(text);
            var Tokens = Tokenizer.Tokenize(TempText, TokenizerLanguage);

            Tokens = NormalizerManager.Normalize(Tokens);
            Tokens = Stemmer.Stem(Tokens, StemmerLanguage);
            Tokens = StopWordsManager.MarkStopWords(Tokens, StopWordsLanguage);

            var Sentences = SentenceDetector.Detect(Tokens, SentenceDetectorLanguage);

            // Tag each sentence separately and store the tagger's result back on that sentence.
            for (int x = 0; x < Sentences.Length; ++x)
            {
                var Sentence = Sentences[x];
                Sentence.Tokens = POSTagger.Tag(Sentence.Tokens, POSTaggerLanguage);
            }

            // The entity finder runs on the full token list and sentences are then re-detected
            // from its output, replacing the sentence array that was tagged above.
            // NOTE(review): whether the POS tags survive this step depends on whether Tag/Find
            // mutate the token instances in place or return new ones - confirm.
            Tokens = EntityFinder.Find(Tokens, EntityFinderType);
            Sentences = SentenceDetector.Detect(Tokens, SentenceDetectorLanguage);

            return new Document(Sentences, Tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
        }
Exemplo n.º 2
        /// <summary>
        /// Take in a paragraph and replace all non-ignored words with a 'smarter' synonym.
        /// </summary>
        /// <param name="data">Paragraph to convert.</param>
        /// <returns>The 'improved' paragraph.</returns>
        public static string ConvertParagraph(string data)
            StringBuilder output = new StringBuilder();

            string[] sentences = MEDetector.Detect(data);

            foreach (string sentence in sentences)
                string[] tokens      = METokenizer.Tokenize(sentence);
                Span[]   names       = MENameFinder.Find(tokens);
                char[]   sentenceArr = sentence.ToCharArray();
                for (int cCharIndex = 0; cCharIndex < sentence.Length; cCharIndex++)
                    if (Char.IsUpper(sentenceArr[cCharIndex]))
                        bool isName = false;
                        for (int cSpanIndex = 0; cSpanIndex < names.Length; cSpanIndex++)
                            if (cCharIndex == names[cSpanIndex].Start)
                                isName = true;

                        if (!isName)
                            sentenceArr[cCharIndex] = Char.ToLower(sentenceArr[cCharIndex]);
                            // TODO: Have to keep track of where the capitals were in the original sentence to add them again later.
                tokens = METokenizer.Tokenize(new string(sentenceArr));
                string[] tags = METagger.Tag(tokens);

                string[] chunks = MEChunker.Chunk(tokens, tags);

                Wnlib.PartsOfSpeech pos = Wnlib.PartsOfSpeech.Unknown;
                for (int i = 0; i < tokens.Length; i++)
                    if (!ConversionConditions.ExcludedPOS.Contains(tags[i]))
                        // Current token POS is not excluded from conversion.
                        if (Regex.IsMatch(chunks[i], "-") && ConversionConditions.IncludedPhrases.Contains(Regex.Split(chunks[i], "-")[1]))
                            // The containing phrase of the current token is not excluded.
                            switch (tags[i])
                            case "NN":
                            case "NNS":
                                pos = Wnlib.PartsOfSpeech.Noun;

                            case "JJ":
                            case "JJR":
                            case "JJS":
                                pos = Wnlib.PartsOfSpeech.Adj;

                            case "RB":
                            case "RBR":
                            case "RBS":
                                pos = Wnlib.PartsOfSpeech.Adv;

                            case "VB":
                            case "VBD":
                            case "VBG":
                            case "VBN":
                            case "VBP":
                            case "VBZ":
                                pos = Wnlib.PartsOfSpeech.Verb;

                            string mostComplexSynonym = GetMostComplexSynyonymScoredWN(tokens[i], pos);
                            // The containing phrase of the current token is excluded.
                        // Current token POS is excluded from conversion.

                    // Checking if a space needs to be added after this token (eg, it is not at the end of the line).
                    // NOTE: Uses two inline if statements.
                    bool isBeforePunctuation;
                        isBeforePunctuation = Regex.IsMatch(tokens[i + 1], IS_BEFORE_PUNCTUATION_MATCH_PATTERN);
                    catch (IndexOutOfRangeException)
                        isBeforePunctuation = false;

                    output.Append((i >= tokens.Length - (sentence.EndsWith(".") ? 2 : 1)) || isBeforePunctuation ? "" : " ");
                    if (tokens[i] == ".")
                        output.Append(Array.IndexOf(sentences, sentence) == (sentences.Length - 1) ? "" : " ");

                        if ((chunks[i + 1] == "O" && tokens[i + 1].Contains("'")) || tokens[i + 1] == "'s")
                            // This is a contraction. Remove the space between the two parts.
                    catch (IndexOutOfRangeException)
                    { /* Don't need to do anything, just means we don't need to remove the last space. */ }
