/// <summary>
/// Looks the word up in every WordNet part-of-speech category and compares the
/// relative strengths (sense counts) of the synsets in each category to pick
/// the most probable part of speech.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>A WordInfo carrying per-category sense counts and the strongest part of speech.</returns>
private static WordInfo lookupWord(string word)
{
    // NOTE: Word definitions are often context-based; passing the whole
    // sentence as context would give a more accurate determination, but this
    // implementation deliberately keeps things simple and considers the word
    // in isolation.
    var result = new WordInfo();
    result.partOfSpeech = Wnlib.PartsOfSpeech.Unknown;

    var allPos = (Wnlib.PartsOfSpeech[])Enum.GetValues(typeof(Wnlib.PartsOfSpeech));
    result.SenseCounts = new int[allPos.Length];

    int bestCount = 0;
    for (int idx = 0; idx < allPos.Length; idx++)
    {
        // Skip the placeholder category.
        Wnlib.PartsOfSpeech candidate = allPos[idx];
        if (candidate == Wnlib.PartsOfSpeech.Unknown)
        {
            continue;
        }

        // Fetch the synset index for this category; nothing found means the
        // word does not occur as this part of speech.
        Wnlib.Index index = Wnlib.Index.lookup(word, Wnlib.PartOfSpeech.of(candidate));
        if (index == null)
        {
            continue;
        }

        // A higher sense count marks a more probable part of speech.
        result.SenseCounts[idx] = index.SenseCnt;
        if (result.SenseCounts[idx] > bestCount)
        {
            bestCount = result.SenseCounts[idx];
            result.partOfSpeech = candidate;
        }
    }

    return result;
}
/// <summary>
/// Gets all synonyms of the given word in the given part of speech and finds the
/// most complex word in the set. Uses WordNet as thesaurus and weighted scores
/// to determine which is most complex.
/// </summary>
/// <param name="word">Word to get synonyms of.</param>
/// <param name="pos">Part of speech of the given word. Used to find more accurate synonyms.</param>
/// <returns>The most complex synonym of the given word in the given part of speech.</returns>
public static string GetMostComplexSynyonymScoredWN(string word, Wnlib.PartsOfSpeech pos)
{
    // TODO: Lemmatization?
    // An Unknown part of speech would cause a NullReferenceException further
    // down, so back out of this right now.
    if (pos == Wnlib.PartsOfSpeech.Unknown)
    {
        return word;
    }

    string[] found = Lexicon.FindSynonyms(word, pos, false);
    if (found == null || found.Length == 0)
    {
        return word;
    }

    #region Synonym collection setup
    // The original word competes against its own synonyms.
    List<string> candidates = new List<string>(found);
    if (!candidates.Contains(word))
    {
        candidates.Add(word);
    }
    #endregion

    #region Most complex synonym find
    // Track the highest-scoring candidate; the very first candidate always
    // wins the initial comparison because the best-so-far starts out empty.
    string best = "";
    double bestScore = 0.0;
    foreach (string candidate in candidates)
    {
        double score = WordRater.GetTotalScore(candidate);
        if (best == "" || score > bestScore)
        {
            best = candidate;
            bestScore = score;
        }
    }
    #endregion

    return best;
}
/// <summary>Returns a list of Synonyms for a given word</summary>
/// <param name="word">the word</param>
/// <param name="pos">The Part of speech of a word</param>
/// <param name="includeMorphs">include morphology? (fuzzy matching)</param>
/// <returns>An array of strings containing the synonyms found</returns>
/// <remarks>
/// Note that my usage of 'Synonyms' here is not the same as hypernyms as defined by
/// WordNet. Synonyms in this sense are merely words in the same SynSet as the given
/// word. Hypernyms are found by tracing the pointers in a given synset.
/// </remarks>
public static string[] FindSynonyms(string word, Wnlib.PartsOfSpeech pos, bool includeMorphs)
{
    // Try an exact lookup against the synset index first.
    word = word.ToLower();
    Wnlib.Index index = Wnlib.Index.lookup(word, Wnlib.PartOfSpeech.of(pos));

    if (index == null)
    {
        // No exact match and fuzzy matching disabled: give up.
        if (!includeMorphs)
        {
            return null;
        }

        // Fall back to morphology: try each morph until one resolves.
        var morphs = new Wnlib.MorphStr(word, Wnlib.PartOfSpeech.of(pos));
        for (string morph = morphs.next(); morph != null; morph = morphs.next())
        {
            index = Wnlib.Index.lookup(morph, Wnlib.PartOfSpeech.of(pos));
            if (index != null)
            {
                break;
            }
        }
    }

    // Neither the word nor any morph matched.
    if (index == null)
    {
        return null;
    }

    // At this point we will always have a valid index.
    return lookupSynonyms(index);
}
/// <summary>
/// Handles the part-of-speech lookup button: prints per-category sense counts
/// and the most probable part of speech for the word entered in txtWord.
/// </summary>
private void cmdLookupPOS_Click(object sender, System.EventArgs e)
{
    txtOut.Text = "";

    // Perform a part-of-speech lookup using the Lexicon.
    // NOTE: Including morphology matches only changes the output when no
    // direct matches have been found.
    WnLexicon.WordInfo wordinfo = WnLexicon.Lexicon.FindWordInfo(txtWord.Text, chkMorphs.Checked);

    // Unknown part of speech means no match at all
    // (equivalent to wordinfo.Strength == 0).
    if (wordinfo.partOfSpeech == Wnlib.PartsOfSpeech.Unknown)
    {
        txtOut.AppendText("No Match found!\r\n");
        return;
    }

    // Print the sense count for every real part of speech.
    var allPos = (Wnlib.PartsOfSpeech[])Enum.GetValues(typeof(Wnlib.PartsOfSpeech));
    txtOut.AppendText("\r\nSense Counts:\r\n");
    for (int idx = 0; idx < allPos.Length; idx++)
    {
        Wnlib.PartsOfSpeech pos = allPos[idx];
        if (pos == Wnlib.PartsOfSpeech.Unknown)
        {
            continue;
        }
        txtOut.AppendText(String.Format("{0,12}: {1}\r\n", pos, wordinfo.senseCounts[idx]));
    }

    txtOut.AppendText(String.Format("\r\nProbable Part Of Speech: {0}\r\n", wordinfo.partOfSpeech));
}
/// <summary>
/// Fallback lookup used when the word was not found with an exact match:
/// enumerates all parts of speech, expands the word's morphs in each category,
/// performs a lookup on each morph, collects the morph/strength data sets, and
/// finally returns the strongest one.
/// </summary>
/// <param name="word">The word whose morphs should be looked up.</param>
/// <returns>The WordInfo with the greatest total strength (empty if none matched).</returns>
private static WordInfo LookupWordMorphs(string word)
{
    ArrayList collected = new ArrayList();

    // Phase 1: gather sense counts for every morph in every part of speech.
    for (int i = 0; i < Enums.Length; i++)
    {
        Wnlib.PartsOfSpeech pos = Enums[i];
        if (pos == Wnlib.PartsOfSpeech.Unknown)
        {
            continue;
        }

        // Enumerate the morphs this category generates for the word.
        Wnlib.MorphStr morphs = new Wnlib.MorphStr(word, Wnlib.PartOfSpeech.of(pos));
        string morph;
        while ((morph = morphs.next()) != null)
        {
            // No synset collection for this morph in this category.
            Wnlib.Index index = Wnlib.Index.lookup(morph, Wnlib.PartOfSpeech.of(pos));
            if (index == null)
            {
                continue;
            }

            // Record the sense count on the (possibly shared) per-morph entry.
            WordInfo info = GetMorphInfo(collected, morph);
            info.SenseCounts[i] = index.SenseCnt;
        }
    }

    // Phase 2: pick the entry with the greatest total strength. As in the
    // original, each entry's partOfSpeech is set to its own strongest
    // category as a side effect of the scan.
    WordInfo best = new WordInfo();
    int bestStrength = 0;
    foreach (WordInfo info in collected)
    {
        int topSenses = 0;
        int strength = 0;
        for (int i = 0; i < Enums.Length; i++)
        {
            Wnlib.PartsOfSpeech pos = Enums[i];
            if (pos == Wnlib.PartsOfSpeech.Unknown)
            {
                continue;
            }

            // Strength is the sum of sense counts across all categories.
            strength += info.SenseCounts[i];
            if (info.SenseCounts[i] > topSenses)
            {
                topSenses = info.SenseCounts[i];
                info.partOfSpeech = pos;
            }
        }

        if (strength > bestStrength)
        {
            bestStrength = strength;
            best = info;
        }
    }

    return best;
}
/// <summary>
/// Fallback lookup used when the word was not found with an exact match:
/// enumerates all parts of speech, expands the word's morphs in each category,
/// performs a lookup on each morph, saves the morph/strength/part-of-speech
/// data sets, and combines them into the strongest one.
/// </summary>
/// <param name="word">The word whose morphs should be looked up.</param>
/// <param name="tagged_only">
/// When true, only semantically tagged senses are counted, and morphs with no
/// tagged sense are skipped entirely.
/// </param>
/// <returns>The combined WordInfo produced by WordInfo.Compine.</returns>
private static WordInfo lookupWordMorphs(string word, bool tagged_only)
{
    ArrayList wordinfos = new ArrayList();

    // for each part of speech...
    for (int i = 0; i < enums.Length; i++)
    {
        // get a valid part of speech
        Wnlib.PartsOfSpeech pos = enums[i];
        if (pos == Wnlib.PartsOfSpeech.Unknown)
        {
            continue;
        }

        // generate morph list
        Wnlib.MorphStr morphs = new Wnlib.MorphStr(word, Wnlib.PartOfSpeech.of(pos));
        string morph = "";
        while ((morph = morphs.next()) != null)
        {
            // get an index to a synset collection; none found?
            Wnlib.Index index = Wnlib.Index.lookup(morph, Wnlib.PartOfSpeech.of(pos));
            if (index == null)
            {
                continue;
            }

            // none tagged? skip when only tagged senses count
            if (tagged_only && index.tagsense_cnt == 0)
            {
                continue;
            }

            // save the sense count on the (possibly shared) per-morph entry
            WordInfo wordinfo = getMorphInfo(wordinfos, morph);
            if (tagged_only)
            {
                wordinfo.senseCounts[i] = index.tagsense_cnt;
            }
            else
            {
                wordinfo.senseCounts[i] = index.sense_cnt;
            }
        }
    }

    // NOTE: "Compine" is the (misspelled) project API that folds the collected
    // data sets into the strongest WordInfo; the old hand-rolled best-match
    // search that used to live here as commented-out code has been removed.
    return WordInfo.Compine(wordinfos);
}
/// <summary>
/// Take in a paragraph and replace all non-ignored words with a 'smarter' synonym.
/// </summary>
/// <param name="data">Paragraph to convert.</param>
/// <returns>The 'improved' paragraph.</returns>
public static string ConvertParagraph(string data)
{
    StringBuilder output = new StringBuilder();
    string[] sentences = MEDetector.Detect(data);
    foreach (string sentence in sentences)
    {
        string[] tokens = METokenizer.Tokenize(sentence);
        Span[] names = MENameFinder.Find(tokens);

        // Lowercase every capital that does not begin a detected name span, so
        // the tagger/chunker see normalized text.
        // TODO: Have to keep track of where the capitals were in the original
        // sentence to add them again later.
        char[] sentenceArr = sentence.ToCharArray();
        for (int cCharIndex = 0; cCharIndex < sentence.Length; cCharIndex++)
        {
            if (!Char.IsUpper(sentenceArr[cCharIndex]))
            {
                continue;
            }
            bool isName = false;
            for (int cSpanIndex = 0; cSpanIndex < names.Length; cSpanIndex++)
            {
                if (cCharIndex == names[cSpanIndex].Start)
                {
                    isName = true;
                }
            }
            if (!isName)
            {
                sentenceArr[cCharIndex] = Char.ToLower(sentenceArr[cCharIndex]);
            }
        }

        // Re-tokenize the normalized sentence, then tag and chunk it.
        tokens = METokenizer.Tokenize(new string(sentenceArr));
        string[] tags = METagger.Tag(tokens);
        string[] chunks = MEChunker.Chunk(tokens, tags);

        // NOTE(review): pos lives OUTSIDE the token loop to preserve the
        // original behavior — a non-excluded token whose tag is not handled by
        // the switch below silently reuses the previous token's part of
        // speech. Confirm whether this carry-over is intended.
        Wnlib.PartsOfSpeech pos = Wnlib.PartsOfSpeech.Unknown;
        for (int i = 0; i < tokens.Length; i++)
        {
            if (!ConversionConditions.ExcludedPOS.Contains(tags[i]))
            {
                // Current token POS is not excluded from conversion.
                if (Regex.IsMatch(chunks[i], "-") && ConversionConditions.IncludedPhrases.Contains(Regex.Split(chunks[i], "-")[1]))
                {
                    // The containing phrase of the current token is not excluded.
                    // Map the Penn Treebank tag to a WordNet part of speech.
                    switch (tags[i])
                    {
                    case "NN":
                    case "NNS":
                        pos = Wnlib.PartsOfSpeech.Noun;
                        break;

                    case "JJ":
                    case "JJR":
                    case "JJS":
                        pos = Wnlib.PartsOfSpeech.Adj;
                        break;

                    case "RB":
                    case "RBR":
                    case "RBS":
                        pos = Wnlib.PartsOfSpeech.Adv;
                        break;

                    case "VB":
                    case "VBD":
                    case "VBG":
                    case "VBN":
                    case "VBP":
                    case "VBZ":
                        pos = Wnlib.PartsOfSpeech.Verb;
                        break;
                    }

                    string mostComplexSynonym = GetMostComplexSynyonymScoredWN(tokens[i], pos);
                    output.Append(mostComplexSynonym);
                }
                else
                {
                    // The containing phrase of the current token is excluded.
                    output.Append(tokens[i]);
                }
            }
            else
            {
                // Current token POS is excluded from conversion.
                output.Append(tokens[i]);
            }

            // Checking if a space needs to be added after this token (eg, it is
            // not at the end of the line). Bounds are checked explicitly rather
            // than catching IndexOutOfRangeException — exceptions are not
            // control flow.
            bool isBeforePunctuation =
                (i + 1 < tokens.Length) && Regex.IsMatch(tokens[i + 1], IS_BEFORE_PUNCTUATION_MATCH_PATTERN);
            output.Append((i >= tokens.Length - (sentence.EndsWith(".") ? 2 : 1)) || isBeforePunctuation ? "" : " ");
            if (tokens[i] == ".")
            {
                output.Append(Array.IndexOf(sentences, sentence) == (sentences.Length - 1) ? "" : " ");
            }

            // Contraction ahead (e.g. "'s"): remove the space between the two parts.
            if (i + 1 < tokens.Length && i + 1 < chunks.Length
                && ((chunks[i + 1] == "O" && tokens[i + 1].Contains("'")) || tokens[i + 1] == "'s"))
            {
                output.Length--;
            }
        }
    }
    return AddPeriod(StringToSentenceCase(output.ToString()));
}