Beispiel #1
0
        /// <summary>
        /// Looks <paramref name="word"/> up under every known part of speech and records the
        /// sense count reported for each one. The part of speech with the largest sense count
        /// is taken as the most probable one.
        /// </summary>
        /// <param name="word">The word to look up, passed to the index lookup as-is.</param>
        /// <returns>
        /// A <c>WordInfo</c> whose <c>SenseCounts</c> has one slot per <c>Wnlib.PartsOfSpeech</c>
        /// value and whose <c>partOfSpeech</c> is the value with the strictly highest sense count,
        /// or <c>Unknown</c> when no lookup produced a positive count.
        /// </returns>
        /// <remarks>
        /// Known limitation (from the original author): word meaning is context-dependent, so a
        /// context-aware lookup (e.g. passing the whole sentence) would be more accurate. That is
        /// deliberately not attempted here to keep the lookup simple.
        /// </remarks>
        private static WordInfo lookupWord(string word)
        {
            Wnlib.PartsOfSpeech[] allParts = (Wnlib.PartsOfSpeech[])Enum.GetValues(typeof(Wnlib.PartsOfSpeech));

            WordInfo result = new WordInfo();
            result.partOfSpeech = Wnlib.PartsOfSpeech.Unknown;
            result.SenseCounts  = new int[allParts.Length];

            int highest = 0;
            for (int slot = 0; slot < allParts.Length; slot++)
            {
                Wnlib.PartsOfSpeech candidate = allParts[slot];

                // "Unknown" is a placeholder, not something that can be looked up.
                if (candidate != Wnlib.PartsOfSpeech.Unknown)
                {
                    Wnlib.Index entry = Wnlib.Index.lookup(word, Wnlib.PartOfSpeech.of(candidate));

                    // A null entry means the word is not listed under this part of speech.
                    if (entry != null)
                    {
                        int senses = entry.SenseCnt;
                        result.SenseCounts[slot] = senses;

                        // Strict comparison: on a tie the earlier part of speech is kept.
                        if (senses > highest)
                        {
                            highest             = senses;
                            result.partOfSpeech = candidate;
                        }
                    }
                }
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Chooses the highest-scoring ("most complex") candidate among the synonyms that
        /// <c>Lexicon.FindSynonyms</c> returns for a word, with the word itself included as a candidate.
        /// Candidates are ranked with <c>WordRater.GetTotalScore</c>.
        /// </summary>
        /// <param name="word">Word to get synonyms of.</param>
        /// <param name="pos">Part of speech of the given word, forwarded to the synonym lookup.</param>
        /// <returns>
        /// The candidate with the greatest score (the earlier candidate wins a tie), or
        /// <paramref name="word"/> unchanged when <paramref name="pos"/> is <c>Unknown</c> or the
        /// lookup yields no synonyms.
        /// </returns>
        public static string GetMostComplexSynyonymScoredWN(string word, Wnlib.PartsOfSpeech pos)
        {
            // TODO: Lemmatization?

            // The original author notes that continuing with an Unknown part of speech ends in a
            // NullReferenceException, so hand the word back untouched.
            if (pos == Wnlib.PartsOfSpeech.Unknown)
            {
                return word;
            }

            string[] found = Lexicon.FindSynonyms(word, pos, false);
            if (found == null || found.Length == 0)
            {
                return word;
            }

            // Candidate set: every synonym found, plus the original word if it is not already there.
            List<string> candidates = new List<string>(found);
            if (!candidates.Contains(word))
            {
                candidates.Add(word);
            }

            string best      = "";
            double bestScore = 0.0;
            for (int k = 0; k < candidates.Count; k++)
            {
                string candidate = candidates[k];
                double score     = WordRater.GetTotalScore(candidate);

                // While nothing has been picked yet (best is still empty) any candidate is accepted,
                // whatever its score; afterwards only a strictly higher score replaces the pick.
                bool nothingPickedYet = best == "";
                if (nothingPickedYet || score > bestScore)
                {
                    best      = candidate;
                    bestScore = score;
                }
            }

            return best;
        }
Beispiel #3
0
        /// <summary>Returns a list of Synonyms for a given word</summary>
        /// <param name="word">The word; it is lower-cased (culture-invariantly) before lookup.</param>
        /// <param name="pos">The Part of speech of a word</param>
        /// <param name="includeMorphs">
        /// When true and the word itself is not found, each morph produced by <c>Wnlib.MorphStr</c>
        /// is tried in turn and the first one with an index entry is used (fuzzy matching).
        /// </param>
        /// <returns>
        /// An array of strings containing the synonyms found, or <c>null</c> when neither the word
        /// nor (if requested) any of its morphs has an index entry.
        /// </returns>
        /// <remarks>
        /// Note that the usage of 'Synonyms' here is not the same as hypernyms as defined by
        /// WordNet. Synonyms in this sense are merely words in the same SynSet as the given
        /// word. Hypernyms are found by tracing the pointers in a given synset.
        /// </remarks>
        public static string[] FindSynonyms(string word, Wnlib.PartsOfSpeech pos, bool includeMorphs)
        {
            // Normalise case before the lookup. The invariant culture is used so that the key
            // does not depend on the current thread culture (e.g. under a Turkish locale the
            // culture-sensitive ToLower() maps 'I' to dotless 'ı' instead of 'i').
            word = word.ToLowerInvariant();

            // The part-of-speech handle is the same for every lookup below; resolve it once.
            var wnPos = Wnlib.PartOfSpeech.of(pos);

            // get an index to a synset collection
            Wnlib.Index index = Wnlib.Index.lookup(word, wnPos);

            // no direct hit: optionally fall back to the word's morphs
            if (index == null && includeMorphs)
            {
                var    morphs = new Wnlib.MorphStr(word, wnPos);
                string morph;
                while ((morph = morphs.next()) != null)
                {
                    index = Wnlib.Index.lookup(morph, wnPos);
                    if (index != null)
                    {
                        // first morph with an entry wins
                        break;
                    }
                }
            }

            // still none found?
            if (index == null)
            {
                return null;
            }

            // at this point we will always have a valid index
            return lookupSynonyms(index);
        }
Beispiel #4
0
        /// <summary>
        /// Click handler for the part-of-speech lookup button. Clears the output box, asks the
        /// Lexicon for information about the word in <c>txtWord</c> (optionally including morphs,
        /// per <c>chkMorphs</c>) and prints the sense count for each part of speech followed by
        /// the most probable part of speech.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void cmdLookupPOS_Click(object sender, System.EventArgs e)
        {
            txtOut.Text = "";

            // Original author's note: including morphology matches only changes the output when
            // no direct match has been found.
            WnLexicon.WordInfo info = WnLexicon.Lexicon.FindWordInfo(txtWord.Text, chkMorphs.Checked);

            // An Unknown part of speech is the "nothing found" result.
            if (info.partOfSpeech == Wnlib.PartsOfSpeech.Unknown)
            {
                txtOut.AppendText("No Match found!\r\n");
                return;
            }

            txtOut.AppendText("\r\nSense Counts:\r\n");

            // One output line per part of speech; senseCounts is indexed by the enum value's
            // position in Enum.GetValues, so the loop index is used for both arrays.
            Wnlib.PartsOfSpeech[] allParts = (Wnlib.PartsOfSpeech[])Enum.GetValues(typeof(Wnlib.PartsOfSpeech));
            for (int slot = 0; slot < allParts.Length; slot++)
            {
                Wnlib.PartsOfSpeech part = allParts[slot];

                // "Unknown" has no sense count worth showing.
                if (part != Wnlib.PartsOfSpeech.Unknown)
                {
                    string line = String.Format("{0,12}: {1}\r\n", part, info.senseCounts[slot]);
                    txtOut.AppendText(line);
                }
            }

            string verdict = String.Format("\r\nProbable Part Of Speech: {0}\r\n", info.partOfSpeech);
            txtOut.AppendText(verdict);
        }
Beispiel #5
0
        /// <summary>
        /// Morph-based lookup. For every part of speech, enumerates the morphs of
        /// <paramref name="word"/>, looks each morph up and stores its sense count in a
        /// per-morph <c>WordInfo</c>. The per-morph entries are then ranked and the strongest
        /// one is returned.
        /// </summary>
        /// <param name="word">The word whose morphs are examined.</param>
        /// <returns>
        /// The <c>WordInfo</c> whose sense counts sum to the highest total, with its
        /// <c>partOfSpeech</c> set to the part of speech holding its largest single count.
        /// When no morph produced a positive total, a freshly constructed <c>WordInfo</c> is returned.
        /// </returns>
        /// <remarks>
        /// Per the original author's note, this is meant to be called only after an exact-match
        /// lookup of the word has failed.
        /// </remarks>
        private static WordInfo LookupWordMorphs(string word)
        {
            ArrayList collected = new ArrayList();

            // Pass 1: gather sense counts per morph, per part of speech.
            for (int slot = 0; slot < Enums.Length; slot++)
            {
                Wnlib.PartsOfSpeech part = Enums[slot];
                if (part != Wnlib.PartsOfSpeech.Unknown)
                {
                    Wnlib.MorphStr morphSource = new Wnlib.MorphStr(word, Wnlib.PartOfSpeech.of(part));
                    string         candidate   = "";
                    while ((candidate = morphSource.next()) != null)
                    {
                        Wnlib.Index entry = Wnlib.Index.lookup(candidate, Wnlib.PartOfSpeech.of(part));
                        if (entry != null)
                        {
                            // GetMorphInfo supplies the WordInfo for this morph (presumably creating
                            // and registering it in the list on first use - confirm against GetMorphInfo).
                            WordInfo morphInfo = GetMorphInfo(collected, candidate);
                            morphInfo.SenseCounts[slot] = entry.SenseCnt;
                        }
                    }
                }
            }

            // Pass 2: pick the entry with the greatest total strength.
            WordInfo winner      = new WordInfo();
            int      topStrength = 0;

            for (int k = 0; k < collected.Count; k++)
            {
                WordInfo current = (WordInfo)collected[k];

                int total      = 0;
                int topSenses  = 0;
                for (int slot = 0; slot < Enums.Length; slot++)
                {
                    Wnlib.PartsOfSpeech part = Enums[slot];
                    if (part != Wnlib.PartsOfSpeech.Unknown)
                    {
                        int senses = current.SenseCounts[slot];

                        // Strength is the sum over all parts of speech; the part of speech
                        // itself is the one with the strictly largest individual count.
                        total += senses;
                        if (senses > topSenses)
                        {
                            topSenses            = senses;
                            current.partOfSpeech = part;
                        }
                    }
                }

                // Strict comparison: the earlier entry is kept on equal strength.
                if (total > topStrength)
                {
                    topStrength = total;
                    winner      = current;
                }
            }

            return winner;
        }
Beispiel #6
0
        /// <summary>
        /// Morph-based lookup. For every part of speech, enumerates the morphs of
        /// <paramref name="word"/>, looks each morph up and records its sense count in a
        /// per-morph <c>WordInfo</c>. The collected entries are then handed to
        /// <c>WordInfo.Compine</c>, whose result is returned.
        /// </summary>
        /// <param name="word">The word whose morphs are examined.</param>
        /// <param name="tagged_only">
        /// When true, the tagged sense count (<c>tagsense_cnt</c>) is recorded instead of the full
        /// sense count (<c>sense_cnt</c>), and morphs with no tagged senses are skipped entirely.
        /// </param>
        /// <returns>The value produced by <c>WordInfo.Compine</c> for the collected entries.</returns>
        /// <remarks>
        /// Per the original author's note, this is meant to be called only after an exact-match
        /// lookup of the word has failed.
        /// </remarks>
        private static WordInfo lookupWordMorphs(string word, bool tagged_only)
        {
            ArrayList wordinfos = new ArrayList();

            // for each part of speech...
            for (int i = 0; i < enums.Length; i++)
            {
                // get a valid part of speech
                Wnlib.PartsOfSpeech pos = enums[i];
                if (pos == Wnlib.PartsOfSpeech.Unknown)
                {
                    continue;
                }

                // generate morph list
                Wnlib.MorphStr morphs = new Wnlib.MorphStr(word, Wnlib.PartOfSpeech.of(pos));
                string         morph;
                while ((morph = morphs.next()) != null)
                {
                    // get an index to a synset collection
                    Wnlib.Index index = Wnlib.Index.lookup(morph, Wnlib.PartOfSpeech.of(pos));

                    // none found?
                    if (index == null)
                    {
                        continue;
                    }

                    // pick the count that matches the requested mode
                    int count;
                    if (tagged_only)
                    {
                        count = index.tagsense_cnt;

                        // none tagged: do not even create an entry for this morph
                        if (count == 0)
                        {
                            continue;
                        }
                    }
                    else
                    {
                        count = index.sense_cnt;
                    }

                    // save the wordinfo
                    WordInfo wordinfo = getMorphInfo(wordinfos, morph);
                    wordinfo.senseCounts[i] = count;
                }
            }

            return WordInfo.Compine(wordinfos);
        }
Beispiel #7
0
        /// <summary>
        /// Take in a paragraph and replace all non-ignored words with a 'smarter' synonym.
        /// </summary>
        /// <param name="data">Paragraph to convert.</param>
        /// <returns>The 'improved' paragraph.</returns>
        /// <remarks>
        /// Each detected sentence is lower-cased (except characters recognised as the start of a
        /// name), tokenized, tagged and chunked. A token is replaced only when its tag is not in
        /// <c>ConversionConditions.ExcludedPOS</c> and its chunk label names a phrase type listed in
        /// <c>ConversionConditions.IncludedPhrases</c>; every other token is copied through unchanged.
        /// The result is passed through <c>StringToSentenceCase</c> and <c>AddPeriod</c>.
        /// </remarks>
        public static string ConvertParagraph(string data)
        {
            StringBuilder output = new StringBuilder();

            string[] sentences = MEDetector.Detect(data);

            // Iterate by index so "is this the last sentence?" does not depend on the sentence's
            // text (a repeated sentence would otherwise be mistaken for its first occurrence).
            for (int sentenceIndex = 0; sentenceIndex < sentences.Length; sentenceIndex++)
            {
                string sentence       = sentences[sentenceIndex];
                bool   isLastSentence = sentenceIndex == sentences.Length - 1;
                bool   endsWithPeriod = sentence.EndsWith(".", StringComparison.Ordinal);

                string[] tokens      = METokenizer.Tokenize(sentence);
                Span[]   names       = MENameFinder.Find(tokens);
                char[]   sentenceArr = sentence.ToCharArray();

                // Lower-case every capital that is not the start of a detected name.
                // NOTE(review): the character offset is compared with Span.Start; if MENameFinder
                // reports token-based spans these are different units - confirm.
                // TODO: Have to keep track of where the capitals were in the original sentence to add them again later.
                for (int cCharIndex = 0; cCharIndex < sentenceArr.Length; cCharIndex++)
                {
                    if (Char.IsUpper(sentenceArr[cCharIndex]) && !IsNameStartOffset(names, cCharIndex))
                    {
                        sentenceArr[cCharIndex] = Char.ToLower(sentenceArr[cCharIndex]);
                    }
                }

                tokens = METokenizer.Tokenize(new string(sentenceArr));
                string[] tags   = METagger.Tag(tokens);
                string[] chunks = MEChunker.Chunk(tokens, tags);

                for (int i = 0; i < tokens.Length; i++)
                {
                    if (!ConversionConditions.ExcludedPOS.Contains(tags[i]) && IsInIncludedPhrase(chunks[i]))
                    {
                        // The part of speech is derived for THIS token only; an unmapped tag yields
                        // Unknown (the token is then returned unchanged) rather than silently reusing
                        // the previous token's part of speech.
                        Wnlib.PartsOfSpeech pos = MapTagToPartOfSpeech(tags[i]);
                        output.Append(GetMostComplexSynyonymScoredWN(tokens[i], pos));
                    }
                    else
                    {
                        // Either the token's POS or its containing phrase is excluded from conversion.
                        output.Append(tokens[i]);
                    }

                    bool hasNextToken = i + 1 < tokens.Length;

                    // A space follows the token unless it is at the end of the sentence (the last
                    // token, or the one before the closing period) or the next token is punctuation.
                    bool isBeforePunctuation = hasNextToken
                        && Regex.IsMatch(tokens[i + 1], IS_BEFORE_PUNCTUATION_MATCH_PATTERN);
                    bool isAtSentenceEnd = i >= tokens.Length - (endsWithPeriod ? 2 : 1);
                    if (!isAtSentenceEnd && !isBeforePunctuation)
                    {
                        output.Append(' ');
                    }

                    // Separate sentences with a space, except after the final one.
                    if (tokens[i] == "." && !isLastSentence)
                    {
                        output.Append(' ');
                    }

                    // Contraction (e.g. "do" + "n't", "John" + "'s"): glue the two parts together by
                    // removing the separating space - but only if a space was actually emitted.
                    if (hasNextToken && i + 1 < chunks.Length
                        && ((chunks[i + 1] == "O" && tokens[i + 1].Contains("'")) || tokens[i + 1] == "'s")
                        && output.Length > 0 && output[output.Length - 1] == ' ')
                    {
                        output.Length--;
                    }
                }
            }

            return AddPeriod(StringToSentenceCase(output.ToString()));
        }

        /// <summary>Returns true when any of the given name spans starts at <paramref name="offset"/>.</summary>
        private static bool IsNameStartOffset(Span[] names, int offset)
        {
            for (int k = 0; k < names.Length; k++)
            {
                if (names[k].Start == offset)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Returns true when the chunk label has the form "prefix-PHRASE" and PHRASE is listed in
        /// <c>ConversionConditions.IncludedPhrases</c>.
        /// </summary>
        private static bool IsInIncludedPhrase(string chunk)
        {
            string[] parts = chunk.Split('-');
            return parts.Length > 1 && ConversionConditions.IncludedPhrases.Contains(parts[1]);
        }

        /// <summary>
        /// Maps a tagger tag to the corresponding WordNet part of speech; tags with no mapping
        /// yield <c>Unknown</c>.
        /// </summary>
        private static Wnlib.PartsOfSpeech MapTagToPartOfSpeech(string tag)
        {
            switch (tag)
            {
            case "NN":
            case "NNS":
                return Wnlib.PartsOfSpeech.Noun;

            case "JJ":
            case "JJR":
            case "JJS":
                return Wnlib.PartsOfSpeech.Adj;

            case "RB":
            case "RBR":
            case "RBS":
                return Wnlib.PartsOfSpeech.Adv;

            case "VB":
            case "VBD":
            case "VBG":
            case "VBN":
            case "VBP":
            case "VBZ":
                return Wnlib.PartsOfSpeech.Verb;

            default:
                return Wnlib.PartsOfSpeech.Unknown;
            }
        }