/// <summary>
/// Constructor. Builds the WordNet engine for this manager from the WordNet directory exposed by
/// ReferringManager.Instance.
/// </summary>
public WordNetManager()
{
    var wordNetDirectory = ReferringManager.Instance.WordNetDirectory;

    // NOTE(review): the second argument is presumably the engine's in-memory flag -- confirm against WordNetEngine
    wordNetEngine = new WordNetEngine(wordNetDirectory, false);
}
/// <summary>
/// Gets the synsets related to the current synset by a single relation. This is a convenience overload: it wraps the
/// relation in a one-element array and forwards to the multi-relation overload.
/// </summary>
/// <param name="relation">Synset relation to follow</param>
/// <param name="recursive">Whether or not to follow the relation recursively for all related synsets</param>
/// <returns>Synsets related to the current one by the given relation</returns>
public Set<SynSet> GetRelatedSynSets(WordNetEngine.SynSetRelation relation, bool recursive)
{
    WordNetEngine.SynSetRelation[] singleRelation = { relation };
    Set<SynSet> related = GetRelatedSynSets(singleRelation, recursive);
    return related;
}
/// <summary>
/// Builds synset shells from a word index line. A shell is a SynSet constructed with only its POS and offset (plus the
/// engine reference passed in); per the original notes, that is enough to look the full synset up in the data file later.
/// The method is static so that it cannot accidentally pick up a current WordNetEngine -- the engine must be supplied
/// explicitly through the corresponding parameter.
/// </summary>
/// <param name="wordIndexLine">Word index line from which to get synset shells. It is split on spaces; the third field is
/// read as the synset count and the trailing fields (that many) are read as synset offsets.</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="mostCommonSynSet">Receives the shell built from the first offset listed on the line, which is treated as
/// the most common synset for the word</param>
/// <param name="wordNetEngine">WordNetEngine to pass to the constructor of each synset shell</param>
/// <returns>Synset shells for the given index line</returns>
private static Set<SynSet> GetSynSetShells(string wordIndexLine, POS pos, out SynSet mostCommonSynSet, WordNetEngine wordNetEngine)
{
    Set<SynSet> shells = new Set<SynSet>();
    mostCommonSynSet = null;

    // the synset count lives in the third space-delimited field
    string[] fields = wordIndexLine.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int synSetCount = int.Parse(fields[2]);

    // the offsets occupy the tail of the line; walk them from the last one back to the first
    int firstOffsetField = fields.Length - synSetCount;
    SynSet lastCreated = null;
    int fieldIndex = fields.Length;
    while (--fieldIndex >= firstOffsetField)
    {
        int offset = int.Parse(fields[fieldIndex]);
        SynSet shell = new SynSet(pos, offset, wordNetEngine);
        shells.Add(shell);

        // because we walk backwards, the shell created last corresponds to the first offset on the line
        lastCreated = shell;
    }

    if (lastCreated == null)
    {
        throw new Exception("Failed to get most common synset");
    }

    mostCommonSynSet = lastCreated;
    return shells;
}
/// <summary>
/// Counts the synsets related to the current one by the given relation.
/// </summary>
/// <param name="relation">Relation to check</param>
/// <returns>Number of synsets stored for the given relation, or zero when the relation has no entry</returns>
public int GetRelatedSynSetCount(WordNetEngine.SynSetRelation relation)
{
    Set<SynSet> related;
    return _relationSynSets.TryGetValue(relation, out related) ? related.Count : 0;
}
/// <summary>
/// Populates the current synset from its definition line: lexicographer file name, words, gloss, and relations to other
/// synsets. If idSynset is non-null, related synsets are taken from it; otherwise each related synset is created as a
/// shell. A synset can only be instantiated once.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets, keyed by "POS:offset". If null, all related synsets will be created
/// as shells.</param>
internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
{
    // a synset may only be populated once
    if (_instantiated)
    {
        throw new Exception("Synset has already been instantiated");
    }

    /* lexicographer file name: per the original note, the enumeration matches the wordnet spec (see the lexnames file)
     * except that it starts with None, so the number on the definition line is shifted up by one */
    int lexFileNumber = 1 + int.Parse(GetField(definition, 1));
    if (lexFileNumber <= 0)
    {
        throw new Exception("Invalid lexicographer file name number. Should be >= 1.");
    }

    _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexFileNumber;

    // field 3 is the hexadecimal word count; afterwards move the cursor to the character following the next space
    int cursor;
    string wordCountField = GetField(definition, 3, out cursor);
    int wordCount = int.Parse(wordCountField, NumberStyles.HexNumber);
    cursor = definition.IndexOf(' ', cursor) + 1;

    // read the words, each of which is followed by a lex_id field that we skip
    _words = new List<string>(wordCount);
    for (int wordNum = 0; wordNum < wordCount; ++wordNum)
    {
        int spaceAfterWord = definition.IndexOf(' ', cursor + 1);
        string word = definition.Substring(cursor, spaceAfterWord - cursor);
        if (word.IndexOf(' ') >= 0)
        {
            throw new Exception("Unexpected space in word: " + word);
        }

        _words.Add(word);

        // jump over the lex_id field to the start of the next word
        cursor = definition.IndexOf(' ', spaceAfterWord + 1) + 1;
    }

    // the gloss is everything after the first pipe, trimmed
    int pipeIndex = definition.IndexOf('|');
    _gloss = definition.Substring(pipeIndex + 1).Trim();
    if (_gloss.IndexOf('|') >= 0)
    {
        throw new Exception("Unexpected pipe in gloss");
    }

    // the relation count follows the word list (two fields per word after the first three fields)
    int relationCountFieldIndex = 3 + (_words.Count * 2) + 1;
    int relationCursor;
    int relationCount = int.Parse(GetField(definition, relationCountFieldIndex, out relationCursor));
    relationCursor = definition.IndexOf(' ', relationCursor) + 1;

    _relationSynSets = new Dictionary<WordNetEngine.SynSetRelation, Set<SynSet>>();
    _lexicalRelations = new Dictionary<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, Set<int>>>>();

    for (int relationIndex = 0; relationIndex < relationCount; ++relationIndex)
    {
        string symbol = null;
        int relatedOffset = -1;
        WordNetEngine.POS relatedPos = WordNetEngine.POS.None;
        int sourceWord = -1;
        int targetWord = -1;

        // each relation has four columns: symbol, offset, POS, and source/target word indexes
        for (int column = 0; column <= 3; ++column)
        {
            int spaceAfterField = definition.IndexOf(' ', relationCursor + 1);
            string value = definition.Substring(relationCursor, spaceAfterField - relationCursor);

            switch (column)
            {
                case 0:
                    symbol = value;
                    break;

                case 1:
                    relatedOffset = int.Parse(value);
                    break;

                case 2:
                    relatedPos = GetPOS(value);
                    break;

                case 3:
                    // first two hex digits are the source word index, the remainder is the target word index
                    sourceWord = int.Parse(value.Substring(0, 2), NumberStyles.HexNumber);
                    targetWord = int.Parse(value.Substring(2), NumberStyles.HexNumber);
                    break;

                default:
                    throw new Exception();
            }

            relationCursor = spaceAfterField + 1;
        }

        // without a lookup we create a shell; with one we fetch the related synset by its "POS:offset" key
        SynSet related = idSynset == null
            ? new SynSet(relatedPos, relatedOffset, _wordNetEngine)
            : idSynset[relatedPos + ":" + relatedOffset];

        WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, symbol);

        bool isSemantic = sourceWord == 0 && targetWord == 0;
        if (isSemantic)
        {
            // neither a source nor a target word index: the relation holds between whole synsets
            _relationSynSets.EnsureContainsKey(relation, typeof(Set<SynSet>));
            _relationSynSets[relation].Add(related);
        }
        else
        {
            // lexical relation: record relation -> related synset -> source word index -> target word indexes
            _lexicalRelations.EnsureContainsKey(relation, typeof(Dictionary<SynSet, Dictionary<int, Set<int>>>));
            Dictionary<SynSet, Dictionary<int, Set<int>>> bySynSet = _lexicalRelations[relation];

            bySynSet.EnsureContainsKey(related, typeof(Dictionary<int, Set<int>>));
            Dictionary<int, Set<int>> bySourceWord = bySynSet[related];

            bySourceWord.EnsureContainsKey(sourceWord, typeof(Set<int>));
            Set<int> targetWords = bySourceWord[sourceWord];

            if (!targetWords.Contains(targetWord))
            {
                targetWords.Add(targetWord);
            }
        }
    }

    // the wordnet engine reference is no longer needed once the synset is populated
    _wordNetEngine = null;

    _instantiated = true;
}
/// <summary>
/// Constructor. Creates an uninstantiated SynSet shell that carries only its POS, offset, and (optionally) an engine
/// reference. Call SynSet.Instantiate to gain access to the words, gloss, and related SynSets.
/// </summary>
/// <param name="pos">POS of SynSet</param>
/// <param name="offset">Byte location of SynSet definition within data file</param>
/// <param name="wordNetEngine">WordNet engine used to instantiate this synset. This should be non-null only when
/// constructing synsets for disk-based WordNet engines; passing an engine whose InMemory flag is set throws.</param>
internal SynSet(WordNetEngine.POS pos, int offset, WordNetEngine wordNetEngine)
{
    _pos = pos;
    _offset = offset;
    _wordNetEngine = wordNetEngine;
    _instantiated = false;

    bool engineIsInMemory = _wordNetEngine != null && _wordNetEngine.InMemory;
    if (engineIsInMemory)
    {
        throw new Exception("Don't need to pass a non-null WordNetEngine when using in-memory storage");
    }

    // the ID ("POS:offset") and its hash code are computed once up front for efficiency
    _id = string.Concat(_pos, ":", _offset);
    _hashCode = _id.GetHashCode();
}
/// <summary>
/// Gets the similarity of two string/POS pairs. Depending on the strategy, the score is the average, maximum, or minimum
/// of the synset-level similarity over every pairing of the two words' synsets, or the synset-level similarity of the
/// two words' most common synsets. When no synsets (or no most common synsets) are available the result is zero.
/// </summary>
/// <param name="string1">First string</param>
/// <param name="pos1">First POS</param>
/// <param name="string2">Second string</param>
/// <param name="pos2">Second POS</param>
/// <param name="strategy">Similarity strategy to use</param>
/// <param name="relations">Relations to use when computing similarity</param>
/// <returns>Similarity in the range [0, 1]</returns>
public float GetSimilarity(string string1, WordNetEngine.POS pos1, string string2, WordNetEngine.POS pos2, Strategy strategy, params WordNetEngine.SynSetRelation[] relations)
{
    float result = 0;

    switch (strategy)
    {
        case Strategy.WuPalmer1994Average:
        {
            // accumulate the score of every synset pairing, then divide by the number of pairings
            int pairCount = 0;
            foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
            {
                foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
                {
                    result += GetSimilarity(first, second, strategy, relations);
                    ++pairCount;
                }
            }

            if (pairCount > 0)
            {
                result = result / (float)pairCount;
            }

            break;
        }

        case Strategy.WuPalmer1994Maximum:
        {
            // keep the largest score seen across all synset pairings
            foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
            {
                foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
                {
                    float pairScore = GetSimilarity(first, second, strategy, relations);
                    if (pairScore > result)
                    {
                        result = pairScore;
                    }
                }
            }

            break;
        }

        case Strategy.WuPalmer1994Minimum:
        {
            // -1 marks "no score seen yet"; keep the smallest score seen across all synset pairings
            result = -1;
            foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
            {
                foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
                {
                    float pairScore = GetSimilarity(first, second, strategy, relations);
                    if (result == -1 || pairScore < result)
                    {
                        result = pairScore;
                    }
                }
            }

            // no pairings at all means a similarity of zero
            if (result == -1)
            {
                result = 0;
            }

            break;
        }

        case Strategy.WuPalmer1994MostCommon:
        {
            // compare only the most common synset of each word
            SynSet first = _wordNetEngine.GetMostCommonSynSet(string1, pos1);
            SynSet second = _wordNetEngine.GetMostCommonSynSet(string2, pos2);
            if (first != null && second != null)
            {
                result = GetSimilarity(first, second, strategy, relations);
            }

            break;
        }

        default:
            throw new NotImplementedException("Unimplemented strategy: " + strategy);
    }

    // sanity check on the final score
    if (result < 0 || result > 1)
    {
        throw new Exception("Invalid similarity: " + result);
    }

    return result;
}
/// <summary>
/// Constructor. Stores the WordNet engine that this similarity model uses for its synset lookups.
/// </summary>
/// <param name="wordNetEngine">WordNet engine to use</param>
public WordNetSimilarityModel(WordNetEngine wordNetEngine)
{
    this._wordNetEngine = wordNetEngine;
}