/// <summary>
/// Initializes the form: creates the in-memory WordNet engine, fills the part-of-speech
/// list, and resets the semantic-similarity selection state.
/// </summary>
public TestForm()
{
    InitializeComponent();

    // build the engine over the "WordNet" directory, loading everything into memory
    _wordNetEngine = new WordNetEngine(@"WordNet", true);
    if (_wordNetEngine.InMemory == false)
    {
        test.Text = test.Text + " (will take a while)";
    }

    // offer every part of speech except the None placeholder
    foreach (WordNetEngine.POS partOfSpeech in Enum.GetValues(typeof(WordNetEngine.POS)))
    {
        if (partOfSpeech == WordNetEngine.POS.None)
        {
            continue;
        }

        pos.Items.Add(partOfSpeech);
    }

    pos.SelectedIndex = 0;

    // long synset descriptions need horizontal scrolling
    synSets.HorizontalScrollbar = true;

    // nothing is selected for similarity computation yet
    _semSimSs2 = null;
    _semSimSs1 = null;

    // remember the label's initial text so it can be restored later
    _origSsLbl = ss1.Text;

    _semanticSimilarityModel = new WordNetSimilarityModel(_wordNetEngine);
}
/// <summary>
/// Parses a word index line into synset shells. A shell is a SynSet constructed with only its POS,
/// offset, and the supplied engine; that is enough to look the full synset up in the matching data
/// file later. The method is static so that no engine other than the one passed in can be
/// referenced by accident.
/// </summary>
/// <param name="wordIndexLine">Word index line to parse</param>
/// <param name="pos">POS that the index line belongs to</param>
/// <param name="mostCommonSynSet">Receives the shell built from the first offset listed on the line</param>
/// <param name="wordNetEngine">Engine handed to the constructor of every shell</param>
/// <returns>The shells for all synset offsets on the line</returns>
private static Set<SynSet> GetSynSetShells(string wordIndexLine, POS pos, out SynSet mostCommonSynSet, WordNetEngine wordNetEngine)
{
    Set<SynSet> shells = new Set<SynSet>();
    mostCommonSynSet = null;

    // the third space-separated field holds the number of synsets
    string[] fields = wordIndexLine.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int synsetCount = int.Parse(fields[2]);

    // the offsets occupy the trailing fields; walk them from the last one back to the first
    int firstOffsetField = fields.Length - synsetCount;
    int fieldIndex = fields.Length - 1;
    while (fieldIndex >= firstOffsetField)
    {
        SynSet shell = new SynSet(pos, int.Parse(fields[fieldIndex]), wordNetEngine);
        shells.Add(shell);

        // walking backwards, the first listed offset is reached last; it is the most common synset
        if (fieldIndex == firstOffsetField)
        {
            mostCommonSynSet = shell;
        }

        --fieldIndex;
    }

    if (mostCommonSynSet == null)
    {
        throw new Exception("Failed to get most common synset");
    }

    return shells;
}
/// <summary>
/// Looks up a synset by its identifier.
/// </summary>
/// <param name="synsetID">Synset ID in the format produced by SynSet.ID (i.e., POS:Offset)</param>
/// <returns>The stored synset when running in memory; otherwise a newly created and instantiated synset</returns>
public SynSet GetSynSet(string synsetID)
{
    // in-memory mode: every synset is already available by ID
    if (_inMemory)
    {
        return _idSynset[synsetID];
    }

    // on-disk mode: split the ID at the colon into its POS name and numeric offset
    int separator = synsetID.IndexOf(':');
    string posName = synsetID.Substring(0, separator);
    string offsetText = synsetID.Substring(separator + 1);

    POS pos = (POS)Enum.Parse(typeof(POS), posName);
    int offset = int.Parse(offsetText);

    // build a shell for that position, then instantiate it
    SynSet result = new SynSet(pos, offset, this);
    result.Instantiate();
    return result;
}
/// <summary>
/// Constructor. Validates the WordNet directory and its data/index files, re-sorts the index files once
/// (guarded by a flag file in the directory), and then either loads all synsets into memory or opens
/// binary search streams and data file readers for on-disk lookup.
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
/// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist</exception>
/// <exception cref="FileNotFoundException">One of the data.* or index.* files is missing</exception>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;

    // these two are only populated in on-disk mode (see the else branch of the engine init region)
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    if (!System.IO.Directory.Exists(_wordNetDirectory))
        throw new DirectoryNotFoundException("Non-existent WordNet directory: " + _wordNetDirectory);

    // get data and index paths
    string[] dataPaths = new string[] { Path.Combine(_wordNetDirectory, "data.adj"), Path.Combine(_wordNetDirectory, "data.adv"), Path.Combine(_wordNetDirectory, "data.noun"), Path.Combine(_wordNetDirectory, "data.verb") };
    string[] indexPaths = new string[] { Path.Combine(_wordNetDirectory, "index.adj"), Path.Combine(_wordNetDirectory, "index.adv"), Path.Combine(_wordNetDirectory, "index.noun"), Path.Combine(_wordNetDirectory, "index.verb") };

    // make sure all files exist
    foreach (string path in dataPaths.Union(indexPaths))
        if (!System.IO.File.Exists(path))
            throw new FileNotFoundException("Failed to find WordNet file: " + path);

    #region index file sorting
    // the presence of this flag file means the index files were already re-sorted by a previous run
    string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
    if (!System.IO.File.Exists(sortFlagPath))
    {
        /* make sure the index files are sorted according to the current sort order. the index files in the
         * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
         * a different sort order than the .net runtime. thus, unless we resort the lines in the index
         * files, we won't be able to do a proper binary search over the data.
         *
         * NOTE(review): the sort below (List.Sort with the default comparer) and the search delegate further
         * down (string.CompareTo) presumably both use the current culture; since the flag file is permanent,
         * running later under a different culture may not match the stored order - confirm.
         */
        foreach (string indexPath in indexPaths)
        {
            // create temporary file for sorted lines
            string tempPath = Path.GetTempFileName();
            StreamWriter tempFile = new StreamWriter(tempPath);

            // get number of words (lines) in file...only used to pre-size the dictionary below
            int numWords = 0;
            StreamReader indexFile = new StreamReader(indexPath);
            string line;
            // NOTE(review): readers are never closed explicitly in this constructor; presumably TryReadLine
            // closes the reader when it reaches the end of the file - confirm, since the file is deleted below
            while (indexFile.TryReadLine(out line))
                if (!line.StartsWith(" "))
                    ++numWords;

            // get lines in file, sorted by first column (i.e., the word)
            Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
            indexFile = new StreamReader(indexPath);
            while (indexFile.TryReadLine(out line))
                // write header lines to temp file immediately
                if (line.StartsWith(" "))
                    tempFile.WriteLine(line);
                else
                {
                    // trim useless blank spaces from line and map line to first column
                    line = line.Trim();
                    wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                }

            // get sorted words
            List<string> sortedWords = new List<string>(wordLine.Count);
            sortedWords.AddRange(wordLine.Keys);
            sortedWords.Sort();

            // write lines sorted by word
            foreach (string word in sortedWords)
                tempFile.WriteLine(wordLine[word]);

            tempFile.Close();

            // replace original index file with properly sorted one
            System.IO.File.Delete(indexPath);
            System.IO.File.Move(tempPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        StreamWriter sortFlagFile = new StreamWriter(sortFlagPath);
        sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        sortFlagFile.Close();
    }
    #endregion

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets...only used to pre-size the ID lookup created in pass 2
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (dataFile.TryReadLine(out line))
            {
                // a positive index means the line contains a space but does not begin with one
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                    ++totalSynsets;
            }
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (dataFile.TryReadLine(out line))
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // get offset and create synset shell...no engine is passed to the shell in this mode
                    int offset = int.Parse(line.Substring(0, firstSpace));
                    SynSet synset = new SynSet(pos, offset, null);

                    _idSynset.Add(synset.ID, synset);
                }
            }
        }

        // pass 3: instantiate synsets (hooks up relations, set glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (dataFile.TryReadLine(out line))
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                    // instantiate synset defined on current line, using the instantiated synsets for all references
                    // (the key is rebuilt as "pos:offset"; presumably the same format as SynSet.ID used in pass 2 - confirm)
                    _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
            }
        }

        // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
        _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);

            _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, Set<SynSet>>));

            // scan word index file, skipping header lines
            StreamReader indexFile = new StreamReader(indexPath);
            string line;
            while (indexFile.TryReadLine(out line))
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // grab word and synset shells, along with the most common synset
                    string word = line.Substring(0, firstSpace);
                    SynSet mostCommonSynSet;
                    Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                    // set flag on most common synset if it's ambiguous
                    if (synsets.Count > 1)
                        _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);

                    // use reference to the synsets that we instantiated in our three-pass routine above
                    _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                    foreach (SynSet synset in synsets)
                        _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                }
            }
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
            {
                // if we landed on the header text, search further down
                if (currentLine[0] == ' ')
                    return 1;

                // get word on current line
                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                // compare searched-for word to the current word
                return ((string)searchWord).CompareTo(currentWord);
            }));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files...these are stored in the lookup and left open
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
    }
    #endregion
}
/// <summary>
/// Computes the similarity of two synsets.
/// </summary>
/// <param name="synset1">First synset</param>
/// <param name="synset2">Second synset</param>
/// <param name="strategy">Strategy to use. Every strategy whose name starts with WuPalmer1994 yields the same
/// result here because only two synsets are involved.</param>
/// <param name="relations">Synset relations to follow when computing similarity (pass null for all relations)</param>
/// <returns>Similarity in the range [0, 1]; 0 when the synsets share no mutually reachable synset</returns>
public float GetSimilarity(SynSet synset1, SynSet synset2, Strategy strategy, params WordNetEngine.SynSetRelation[] relations)
{
    // null means "follow every relation"
    if (relations == null)
    {
        relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)).Cast<WordNetEngine.SynSetRelation>().ToArray();
    }

    // only the Wu-Palmer family of strategies is handled
    if (!strategy.ToString().StartsWith("WuPalmer1994"))
    {
        throw new NotImplementedException("Unrecognized strategy");
    }

    // find the closest synset reachable from both inputs along the given relations
    SynSet lcs = synset1.GetClosestMutuallyReachableSynset(synset2, relations);
    if (lcs == null)
    {
        return 0;
    }

    // each input's depth is its path length (in edges) to that synset plus the synset's own depth
    int lcsDepth = lcs.GetDepth(relations) + 1;
    int firstDepth = synset1.GetShortestPathTo(lcs, relations).Count - 1 + lcsDepth;
    int secondDepth = synset2.GetShortestPathTo(lcs, relations).Count - 1 + lcsDepth;

    float similarity = 2 * lcsDepth / (float)(firstDepth + secondDepth);

    if (similarity < 0 || similarity > 1)
    {
        throw new Exception("Invalid similarity");
    }

    return similarity;
}
/// <summary>
/// Instantiates the current synset from its data file definition line: parses the lexicographer file name,
/// the words, the gloss, and all relations. If idSynset is non-null, related synsets references are set to
/// those from idSynset; otherwise, related synsets are created as shells.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
/// <exception cref="Exception">The synset was already instantiated, or the definition line has an invalid
/// lexicographer file number, a space inside a word, or a pipe inside the gloss</exception>
internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
{
    // don't re-instantiate
    if (_instantiated)
        throw new Exception("Synset has already been instantiated");

    /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
     * it starts with None, so we need to add 1 to the definition line's value to get the correct file name */
    int lexicographerFileNumber = int.Parse(GetField(definition, 1)) + 1;
    if (lexicographerFileNumber <= 0)
        throw new Exception("Invalid lexicographer file name number. Should be >= 1.");

    _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexicographerFileNumber;

    // get number of words in the synset and the start character of the word list...the count is parsed as hexadecimal
    int wordStart;
    int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber);

    // move past the count field: one character beyond the next space is where the first word begins
    wordStart = definition.IndexOf(' ', wordStart) + 1;

    // get words in synset
    _words = new List<string>(numWords);
    for (int i = 0; i < numWords; ++i)
    {
        // the word runs up to the character before the next space
        int wordEnd = definition.IndexOf(' ', wordStart + 1) - 1;
        int wordLen = wordEnd - wordStart + 1;
        string word = definition.Substring(wordStart, wordLen);
        if (word.Contains(' '))
            throw new Exception("Unexpected space in word: " + word);

        _words.Add(word);

        // skip lex_id field (wordEnd + 1 is the space after the word, so search for the space after that)
        wordStart = definition.IndexOf(' ', wordEnd + 2) + 1;
    }

    // get gloss...everything after the first pipe, trimmed
    _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
    if (_gloss.Contains('|'))
        throw new Exception("Unexpected pipe in gloss");

    // get number and start of relations...the count follows field 3 and the two fields (word, lex_id) per word
    int relationCountField = 3 + (_words.Count * 2) + 1;
    int relationFieldStart;
    int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart));

    // move past the count field to the first character of the first relation
    relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

    // grab each related synset
    _relationSynSets = new Dictionary<WordNetEngine.SynSetRelation, Set<SynSet>>();
    _lexicalRelations = new Dictionary<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, Set<int>>>>();
    for (int relationNum = 0; relationNum < numRelations; ++relationNum)
    {
        // placeholders...each is overwritten by the corresponding column in the loop below
        string relationSymbol = null;
        int relatedSynSetOffset = -1;
        WordNetEngine.POS relatedSynSetPOS = WordNetEngine.POS.None;
        int sourceWordIndex = -1;
        int targetWordIndex = -1;

        // each relation has four columns
        for (int relationField = 0; relationField <= 3; ++relationField)
        {
            // the field runs from relationFieldStart up to the character before the next space
            int fieldEnd = definition.IndexOf(' ', relationFieldStart + 1) - 1;
            int fieldLen = fieldEnd - relationFieldStart + 1;
            string fieldValue = definition.Substring(relationFieldStart, fieldLen);

            // relation symbol
            if (relationField == 0)
                relationSymbol = fieldValue;
            // related synset offset
            else if (relationField == 1)
                relatedSynSetOffset = int.Parse(fieldValue);
            // related synset POS
            else if (relationField == 2)
                relatedSynSetPOS = GetPOS(fieldValue);
            // source/target word for lexical relation...first two hex digits are the source, the rest is the target
            else if (relationField == 3)
            {
                sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber);
                targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber);
            }
            // not reachable with the loop bounds above...kept as a safeguard
            else
                throw new Exception();

            // advance to the first character of the next field
            relationFieldStart = definition.IndexOf(' ', relationFieldStart + 1) + 1;
        }

        // get related synset...create shell if we don't have a lookup
        SynSet relatedSynSet;
        if (idSynset == null)
            relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
        // look up related synset directly
        else
            relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];

        // get relation
        WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

        // add semantic relation if we have neither a source nor a target word index (both parsed as zero)
        if (sourceWordIndex == 0 && targetWordIndex == 0)
        {
            _relationSynSets.EnsureContainsKey(relation, typeof(Set<SynSet>));
            _relationSynSets[relation].Add(relatedSynSet);
        }
        // add lexical relation...stored as relation -> related synset -> source word index -> target word indexes
        else
        {
            _lexicalRelations.EnsureContainsKey(relation, typeof(Dictionary<SynSet, Dictionary<int, Set<int>>>));
            _lexicalRelations[relation].EnsureContainsKey(relatedSynSet, typeof(Dictionary<int, Set<int>>));
            _lexicalRelations[relation][relatedSynSet].EnsureContainsKey(sourceWordIndex, typeof(Set<int>));

            if (!_lexicalRelations[relation][relatedSynSet][sourceWordIndex].Contains(targetWordIndex))
                _lexicalRelations[relation][relatedSynSet][sourceWordIndex].Add(targetWordIndex);
        }
    }

    // release the wordnet engine if we have one...don't need it anymore
    if (_wordNetEngine != null)
        _wordNetEngine = null;

    _instantiated = true;
}
/// <summary>
/// Runs a breadth-first search from the current synset to find the shortest path to another synset,
/// moving only along the given relations.
/// </summary>
/// <param name="destination">Destination synset</param>
/// <param name="relations">Relations to follow, or null for all relations.</param>
/// <returns>Synsets on the path, starting with the current synset and ending with the destination, or null if
/// no path exists.</returns>
public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<WordNetEngine.SynSetRelation> relations)
{
    if (relations == null)
    {
        relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)) as WordNetEngine.SynSetRelation[];
    }

    // the start node must not carry a stale back pointer left behind by some other operation
    _searchBackPointer = null;

    // visited set guards against cycles; the queue holds the search frontier
    Set<SynSet> visited = new Set<SynSet>();
    visited.Add(this);

    Queue<SynSet> frontier = new Queue<SynSet>();
    frontier.Enqueue(this);

    List<SynSet> path = null;
    while (frontier.Count > 0)
    {
        SynSet current = frontier.Dequeue();

        if (current == destination)
        {
            // follow back pointers to the start, then flip the list into start-to-destination order
            path = new List<SynSet>();
            for (SynSet step = current; step != null; step = step.SearchBackPointer)
            {
                path.Add(step);
            }

            path.Reverse();
            break;
        }

        // not there yet: queue every unseen neighbor, remembering how it was reached
        foreach (SynSet neighbor in current.GetRelatedSynSets(relations, false))
        {
            if (visited.Contains(neighbor))
            {
                continue;
            }

            neighbor.SearchBackPointer = current;
            frontier.Enqueue(neighbor);
            visited.Add(neighbor);
        }
    }

    // clean up: clear the back pointer of every synset the search touched
    foreach (SynSet touched in visited)
    {
        touched.SearchBackPointer = null;
    }

    return path;
}
/// <summary>
/// Finds the closest synset that can be reached from both the current synset and another synset along the
/// given relations. With two synsets and the Hypernym relation, for example, the result is the lowest synset
/// that is a hypernym of both; if the hypernym hierarchy is a tree, that is the lowest common ancestor.
/// </summary>
/// <param name="synset">Other synset</param>
/// <param name="relations">Relations to follow</param>
/// <returns>Closest mutually reachable synset, or null if there is none</returns>
public SynSet GetClosestMutuallyReachableSynset(SynSet synset, IEnumerable<WordNetEngine.SynSetRelation> relations)
{
    // visited set guards against cycles; the queue holds candidates in order of discovery
    Set<SynSet> visited = new Set<SynSet>();
    visited.Add(this);

    Queue<SynSet> candidates = new Queue<SynSet>();
    candidates.Enqueue(this);

    while (candidates.Count > 0)
    {
        SynSet candidate = candidates.Dequeue();

        // the first candidate that the other synset can also reach is the answer
        if (synset.GetShortestPathTo(candidate, relations) != null)
        {
            return candidate;
        }

        // otherwise widen the search to the candidate's unseen neighbors
        foreach (SynSet neighbor in candidate.GetRelatedSynSets(relations, false))
        {
            if (visited.Contains(neighbor))
            {
                continue;
            }

            candidates.Enqueue(neighbor);
            visited.Add(neighbor);
        }
    }

    return null;
}
/// <summary>
/// Handles a double-click on the synset list: puts the selected synset into the first free
/// semantic-similarity slot, or asks the user to free a slot when both are taken.
/// </summary>
/// <param name="sender">Event source</param>
/// <param name="e">Mouse event data</param>
private void synSets_MouseDoubleClick(object sender, MouseEventArgs e)
{
    object selectedItem = synSets.SelectedItem;
    if (selectedItem == null)
    {
        return;
    }

    SynSet chosen = (SynSet)selectedItem;
    string removalHint = " (double-click to remove)";

    if (_semSimSs1 == null)
    {
        // first slot is free
        _semSimSs1 = chosen;
        ss1.Text = _semSimSs1.ToString() + removalHint;
    }
    else if (_semSimSs2 == null)
    {
        // second slot is free
        _semSimSs2 = chosen;
        ss2.Text = _semSimSs2.ToString() + removalHint;
    }
    else
    {
        MessageBox.Show("Please remove one of the synsets selected below (double-click it)");
    }

    // similarity can only be computed once both slots are filled
    bool bothChosen = _semSimSs1 != null && _semSimSs2 != null;
    computeSemSim.Enabled = bothChosen;
}
/// <summary>
/// Handles a double-click on the second synset label: clears the second semantic-similarity
/// slot and disables the compute control.
/// </summary>
/// <param name="sender">Event source</param>
/// <param name="e">Event data</param>
private void ss2_DoubleClick(object sender, EventArgs e)
{
    // put the saved label text back
    ss2.Text = _origSsLbl;

    // forget the synset that was held in the second slot
    _semSimSs2 = null;

    // with a slot empty there is nothing to compute
    computeSemSim.Enabled = false;
}