/// <summary>
/// Looks up a single synset by its identifier.
/// </summary>
/// <param name="synsetID">Synset identifier in the format produced by SynSet.ID (i.e., POS:Offset)</param>
/// <returns>The synset taken from the in-memory lookup when the engine is in-memory; otherwise a synset
/// built from the POS and offset encoded in the ID and then instantiated</returns>
public SynSet GetSynSet(string synsetID)
{
    // in-memory engines keep every synset keyed by its ID
    if (_inMemory)
    {
        return _idSynset[synsetID];
    }

    // the ID is "POS:Offset" - split it at the colon
    int separator = synsetID.IndexOf(':');
    string posName = synsetID.Substring(0, separator);
    POS pos = (POS)Enum.Parse(typeof(POS), posName);

    string offsetText = synsetID.Substring(separator + 1);
    int offset = int.Parse(offsetText);

    // build a shell bound to this engine, then instantiate it
    SynSet shell = new SynSet(pos, offset, this);
    shell.Instantiate();

    return shell;
}
/// <summary>
/// Builds synset shells for every synset offset listed on a word index line. A shell is a SynSet created with
/// only a POS, an offset and the supplied engine; that is enough to look the full synset up later. The method is
/// static so that it cannot accidentally touch the current WordNetEngine - the engine to use must be passed in.
/// </summary>
/// <param name="wordIndexLine">Word index line to read synset offsets from</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="mostCommonSynSet">Receives the shell for the first offset listed on the line (the most common synset)</param>
/// <param name="wordNetEngine">WordNetEngine handed to the constructor of each shell</param>
/// <returns>Synset shells for the given index line</returns>
private static Set<SynSet> GetSynSetShells(string wordIndexLine, POS pos, out SynSet mostCommonSynSet, WordNetEngine wordNetEngine)
{
    mostCommonSynSet = null;
    Set<SynSet> shells = new Set<SynSet>();

    // the third space-separated field holds the number of synsets
    string[] fields = wordIndexLine.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int synsetCount = int.Parse(fields[2]);

    // the offsets are the trailing fields; walk them from the last one back to the first
    int firstOffsetField = fields.Length - synsetCount;
    int field = fields.Length - 1;
    while (field >= firstOffsetField)
    {
        SynSet shell = new SynSet(pos, int.Parse(fields[field]), wordNetEngine);
        shells.Add(shell);

        // walking backwards, so the final iteration lands on the first listed offset
        if (field == firstOffsetField)
        {
            mostCommonSynSet = shell;
        }

        --field;
    }

    if (mostCommonSynSet == null)
    {
        throw new Exception("Failed to get most common synset");
    }

    return shells;
}
/// <summary>
/// Finds the shortest path from this synset to another one using a breadth-first search over the given relations.
/// </summary>
/// <param name="destination">Synset to reach</param>
/// <param name="relations">Relations that may be followed, or null to follow every relation.</param>
/// <returns>Synsets along the path, starting with this synset and ending with the destination; null if no path exists.</returns>
public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<WordNetEngine.SynSetRelation> relations)
{
    // null means "follow everything"
    if (relations == null)
    {
        relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)) as WordNetEngine.SynSetRelation[];
    }

    // the start node must not point anywhere - other code may have left a stale backpointer behind
    _searchBackPointer = null;

    // everything seen so far, which both prevents cycles and records what needs cleaning up afterwards
    Set<SynSet> visited = new Set<SynSet>();
    visited.Add(this);

    Queue<SynSet> frontier = new Queue<SynSet>();
    frontier.Enqueue(this);

    List<SynSet> path = null;
    while (frontier.Count > 0)
    {
        SynSet current = frontier.Dequeue();

        if (current == destination)
        {
            // walk the backpointers to the start, then flip into start-to-destination order
            path = new List<SynSet>();
            for (SynSet step = current; step != null; step = step.SearchBackPointer)
            {
                path.Add(step);
            }

            path.Reverse();
            break;
        }

        // not there yet - push every unseen neighbour onto the frontier
        foreach (SynSet neighbour in current.GetRelatedSynSets(relations, false))
        {
            if (visited.Contains(neighbour))
            {
                continue;
            }

            neighbour.SearchBackPointer = current;
            frontier.Enqueue(neighbour);
            visited.Add(neighbour);
        }
    }

    // leave no search state behind on any synset we touched
    foreach (SynSet seen in visited)
    {
        seen.SearchBackPointer = null;
    }

    return path;
}
/// <summary>
/// Instantiates the current synset by parsing its definition line: lexicographer file name, words, gloss and
/// relations. If idSynset is non-null, related synsets references are set to those from idSynset; otherwise,
/// related synsets are created as shells.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets, keyed by "POS:Offset". If null, all related synsets will be created as shells.</param>
/// <exception cref="Exception">Thrown if the synset was already instantiated, if the lexicographer file number
/// is invalid, if a parsed word contains a space, or if the gloss contains a pipe character.</exception>
internal void Instantiate(string definition, Dictionary <string, SynSet> idSynset)
{
    // don't re-instantiate
    if (_instantiated)
    {
        throw new Exception("Synset has already been instantiated");
    }

    /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
     * it starts with None, so we need to add 1 to the definition line's value to get the correct file name */
    // NOTE(review): GetField is not visible here; presumably it returns the text of the given space-delimited field - confirm
    int lexicographerFileNumber = int.Parse(GetField(definition, 1)) + 1;
    if (lexicographerFileNumber <= 0)
    {
        throw new Exception("Invalid lexicographer file name number. Should be >= 1.");
    }

    _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexicographerFileNumber;

    // get number of words in the synset (field 3, parsed as hexadecimal) and the start character of the word list
    int wordStart;
    int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber);

    // move from the word-count field to the character just after the next space, i.e., the first word
    wordStart = definition.IndexOf(' ', wordStart) + 1;

    // get words in synset
    _words = new List <string>(numWords);
    for (int i = 0; i < numWords; ++i)
    {
        // wordEnd is the index of the last character before the next space
        int wordEnd = definition.IndexOf(' ', wordStart + 1) - 1;
        int wordLen = wordEnd - wordStart + 1;
        string word = definition.Substring(wordStart, wordLen);
        if (Enumerable.Contains(word, ' ')) //ZK changed to static method
        {
            throw new Exception("Unexpected space in word: " + word);
        }

        _words.Add(word);

        // skip lex_id field: wordEnd + 1 is the space after the word, wordEnd + 2 is where the lex_id starts,
        // and the character after the next space is the start of the following field
        wordStart = definition.IndexOf(' ', wordEnd + 2) + 1;
    }

    // get gloss: everything after the first pipe, trimmed (if there is no pipe, IndexOf yields -1 and the
    // whole line is taken)
    _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
    if (Enumerable.Contains(_gloss, '|')) //ZK changed to static method
    {
        throw new Exception("Unexpected pipe in gloss");
    }

    // get number and start of relations: the count field follows the word-count field (3) and two fields per word
    int relationCountField = 3 + (_words.Count * 2) + 1;
    int relationFieldStart;
    int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart));

    // advance to the character after the next space, i.e., the first relation's first column
    relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

    // grab each related synset
    _relationSynSets = new Dictionary <WordNetEngine.SynSetRelation, Set <SynSet> >();
    _lexicalRelations = new Dictionary <WordNetEngine.SynSetRelation, Dictionary <SynSet, Dictionary <int, Set <int> > > >();
    for (int relationNum = 0; relationNum < numRelations; ++relationNum)
    {
        // placeholder values; each is overwritten by the four-column loop below
        string relationSymbol = null;
        int relatedSynSetOffset = -1;
        WordNetEngine.POS relatedSynSetPOS = WordNetEngine.POS.None;
        int sourceWordIndex = -1;
        int targetWordIndex = -1;

        // each relation has four columns
        for (int relationField = 0; relationField <= 3; ++relationField)
        {
            // extract the text between relationFieldStart and the next space
            int fieldEnd = definition.IndexOf(' ', relationFieldStart + 1) - 1;
            int fieldLen = fieldEnd - relationFieldStart + 1;
            string fieldValue = definition.Substring(relationFieldStart, fieldLen);

            // relation symbol
            if (relationField == 0)
            {
                relationSymbol = fieldValue;
            }
            // related synset offset
            else if (relationField == 1)
            {
                relatedSynSetOffset = int.Parse(fieldValue);
            }
            // related synset POS
            else if (relationField == 2)
            {
                relatedSynSetPOS = GetPOS(fieldValue);
            }
            // source/target word for lexical relation: first two characters are the source index, the rest
            // is the target index, both hexadecimal
            else if (relationField == 3)
            {
                sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber);
                targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber);
            }
            // cannot be reached with the loop bounds above (relationField is always 0..3)
            else
            {
                throw new Exception();
            }

            // step to the character after the next space, i.e., the start of the next column
            relationFieldStart = definition.IndexOf(' ', relationFieldStart + 1) + 1;
        }

        // get related synset...create shell if we don't have a lookup
        SynSet relatedSynSet;
        if (idSynset == null)
        {
            relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
        }
        // look up related synset directly (throws if the "POS:Offset" key is missing from the lookup)
        else
        {
            relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];
        }

        // get relation
        WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

        // add semantic relation if we have neither a source nor a target word index
        if (sourceWordIndex == 0 && targetWordIndex == 0)
        {
            DictionaryExtensions.EnsureContainsKey(_relationSynSets, relation, typeof(Set <SynSet>)); //ZK changed to static method
            _relationSynSets[relation].Add(relatedSynSet);
        }
        // add lexical relation: relation -> related synset -> source word index -> set of target word indexes
        else
        {
            DictionaryExtensions.EnsureContainsKey(_lexicalRelations, relation, typeof(Dictionary <SynSet, Dictionary <int, Set <int> > >)); //ZK changed to static method
            DictionaryExtensions.EnsureContainsKey(_lexicalRelations[relation], relatedSynSet, typeof(Dictionary <int, Set <int> >)); //ZK changed to static method
            DictionaryExtensions.EnsureContainsKey(_lexicalRelations[relation][relatedSynSet], sourceWordIndex, typeof(Set <int>)); //ZK changed to static method

            // only record each target word index once
            if (!_lexicalRelations[relation][relatedSynSet][sourceWordIndex].Contains(targetWordIndex))
            {
                _lexicalRelations[relation][relatedSynSet][sourceWordIndex].Add(targetWordIndex);
            }
        }
    }

    // release the wordnet engine if we have one...don't need it anymore
    if (_wordNetEngine != null)
    {
        _wordNetEngine = null;
    }

    _instantiated = true;
}
/// <summary>
/// Computes the similarity of two string/POS pairs. Depending on the strategy, the score is the average, maximum
/// or minimum over every pairing of the two words' synsets, or the score of their most common synsets.
/// </summary>
/// <param name="string1">First string</param>
/// <param name="pos1">First POS</param>
/// <param name="string2">Second string</param>
/// <param name="pos2">Second POS</param>
/// <param name="strategy">Similarity strategy to use</param>
/// <param name="relations">Relations to use when computing similarity</param>
/// <returns>Similarity in [0, 1]; zero when no synset pairing is available</returns>
public float GetSimilarity(string string1, WordNetEngine.POS pos1, string string2, WordNetEngine.POS pos2, Strategy strategy, params WordNetEngine.SynSetRelation[] relations)
{
    float result = 0;

    if (strategy == Strategy.WuPalmer1994Average)
    {
        // sum the score of every synset pairing, then divide by the number of pairings
        int pairCount = 0;
        foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
        {
            foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
            {
                result += GetSimilarity(first, second, strategy, relations);
                ++pairCount;
            }
        }

        if (pairCount > 0)
        {
            result = result / (float)pairCount;
        }
    }
    else if (strategy == Strategy.WuPalmer1994Maximum)
    {
        // keep the best score seen over all pairings
        foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
        {
            foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
            {
                float pairScore = GetSimilarity(first, second, strategy, relations);
                if (pairScore > result)
                {
                    result = pairScore;
                }
            }
        }
    }
    else if (strategy == Strategy.WuPalmer1994Minimum)
    {
        // -1 marks "nothing scored yet"; keep the worst score seen over all pairings
        result = -1;
        foreach (SynSet first in _wordNetEngine.GetSynSets(string1, pos1))
        {
            foreach (SynSet second in _wordNetEngine.GetSynSets(string2, pos2))
            {
                float pairScore = GetSimilarity(first, second, strategy, relations);
                if (result == -1 || pairScore < result)
                {
                    result = pairScore;
                }
            }
        }

        // no pairing was scored, so fall back to zero
        if (result == -1)
        {
            result = 0;
        }
    }
    else if (strategy == Strategy.WuPalmer1994MostCommon)
    {
        // compare only the most common synset of each word, when both exist
        SynSet first = _wordNetEngine.GetMostCommonSynSet(string1, pos1);
        SynSet second = _wordNetEngine.GetMostCommonSynSet(string2, pos2);
        if (first != null && second != null)
        {
            result = GetSimilarity(first, second, strategy, relations);
        }
    }
    else
    {
        throw new NotImplementedException("Unimplemented strategy: " + strategy);
    }

    // sanity check on the final score
    if (result < 0 || result > 1)
    {
        throw new Exception("Invalid similarity: " + result);
    }

    return result;
}
/// <summary>
/// Constructor. Verifies that the WordNet directory and its data/index files exist, re-sorts the index files
/// once (guarded by a flag file) and then either loads everything into memory or opens the on-disk search streams.
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
/// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist.</exception>
/// <exception cref="FileNotFoundException">One of the data.* or index.* files is missing from the directory.</exception>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    if (!System.IO.Directory.Exists(_wordNetDirectory))
    {
        throw new DirectoryNotFoundException("Non-existent WordNet directory: " + _wordNetDirectory);
    }

    // get data and index paths
    string[] dataPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "data.adj"),
        Path.Combine(_wordNetDirectory, "data.adv"),
        Path.Combine(_wordNetDirectory, "data.noun"),
        Path.Combine(_wordNetDirectory, "data.verb")
    };

    string[] indexPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "index.adj"),
        Path.Combine(_wordNetDirectory, "index.adv"),
        Path.Combine(_wordNetDirectory, "index.noun"),
        Path.Combine(_wordNetDirectory, "index.verb")
    };

    // make sure all files exist
    foreach (string path in Enumerable.Union(dataPaths, indexPaths))
    {
        if (!System.IO.File.Exists(path))
        {
            throw new FileNotFoundException("Failed to find WordNet file: " + path);
        }
    }

    #region index file sorting
    string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
    if (!System.IO.File.Exists(sortFlagPath))
    {
        /* make sure the index files are sorted according to the current sort order. the index files in the
         * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
         * a different sort order than the .net runtime. thus, unless we resort the lines in the index
         * files, we won't be able to do a proper binary search over the data.
         *
         * NOTE(review): the sort below and the CompareTo in the search delegate both use the default string
         * comparison, while the flag file persists across runs - confirm that the culture cannot change between
         * the run that sorts the files and later runs that search them. */
        foreach (string indexPath in indexPaths)
        {
            // create temporary file for sorted lines; the using blocks guarantee that the writer and both
            // readers are released before the original index file is deleted and replaced below
            string tempPath = Path.GetTempFileName();
            using (StreamWriter tempFile = new StreamWriter(tempPath))
            {
                string line;

                // get number of words (lines) in file, used only to pre-size the dictionary
                int numWords = 0;
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                    {
                        if (!line.StartsWith(" "))
                        {
                            ++numWords;
                        }
                    }
                }

                // get lines in file, keyed by first column (i.e., the word)
                Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                    {
                        // write header lines to temp file immediately
                        if (line.StartsWith(" "))
                        {
                            tempFile.WriteLine(line);
                        }
                        else
                        {
                            // trim useless blank spaces from line and map line to first column
                            line = line.Trim();
                            wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                        }
                    }
                }

                // get sorted words
                List<string> sortedWords = new List<string>(wordLine.Count);
                sortedWords.AddRange(wordLine.Keys);
                sortedWords.Sort();

                // write lines sorted by word
                foreach (string word in sortedWords)
                {
                    tempFile.WriteLine(wordLine[word]);
                }
            }

            // replace original index file with properly sorted one
            System.IO.File.Delete(indexPath);
            System.IO.File.Move(tempPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        using (StreamWriter sortFlagFile = new StreamWriter(sortFlagPath))
        {
            sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        }
    }
    #endregion

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        ++totalSynsets;
                    }
                }
            }
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // get offset and create synset shell
                        int offset = int.Parse(line.Substring(0, firstSpace));
                        SynSet synset = new SynSet(pos, offset, null);
                        _idSynset.Add(synset.ID, synset);
                    }
                }
            }
        }

        // pass 3: instantiate synsets (hooks up relations, set glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // instantiate synset defined on current line, using the instantiated synsets for all references
                        _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }
            }
        }

        // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
        _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);

            DictionaryExtensions.EnsureContainsKey(_posWordSynSets, pos, typeof(Dictionary<string, Set<SynSet>>));

            // scan word index file, skipping header lines
            using (StreamReader indexFile = new StreamReader(indexPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // grab word and synset shells, along with the most common synset
                        string word = line.Substring(0, firstSpace);
                        SynSet mostCommonSynSet;
                        Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                        // set flag on most common synset if it's ambiguous
                        if (synsets.Count > 1)
                        {
                            _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);
                        }

                        // use reference to the synsets that we instantiated in our three-pass routine above
                        _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                        foreach (SynSet synset in synsets)
                        {
                            _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
            {
                // if we landed on the header text, search further down
                if (currentLine[0] == ' ')
                {
                    return 1;
                }

                // get word on current line
                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                // compare searched-for word to the current word
                return ((string)searchWord).CompareTo(currentWord);
            }));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files; these stay open on purpose because they are stored in a field
        // for later lookups
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
        {
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
        }
    }
    #endregion
}