/// <summary>
/// Collects the words that are lexically related to words of this synset. Semantic relations
/// (e.g., hypernym) hold between whole synsets, whereas lexical relations hold between individual
/// words of two synsets. This method walks every stored lexical relation and resolves the stored
/// word indexes into the actual words.
/// </summary>
/// <returns>Mapping from relation to a mapping from each source word in this synset to the set of
/// related words found in the related synsets</returns>
public Dictionary<WordNetEngine.SynSetRelation, Dictionary<string, Set<string>>> GetLexicallyRelatedWords()
{
    Dictionary<WordNetEngine.SynSetRelation, Dictionary<string, Set<string>>> result =
        new Dictionary<WordNetEngine.SynSetRelation, Dictionary<string, Set<string>>>();

    foreach (KeyValuePair<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, Set<int>>>> relationEntry in _lexicalRelations)
    {
        WordNetEngine.SynSetRelation relation = relationEntry.Key;

        // every relation gets an entry, even if no word pairs end up below it
        DictionaryExtensions.EnsureContainsKey(result, relation, typeof(Dictionary<string, Set<string>>));
        Dictionary<string, Set<string>> wordsForRelation = result[relation];

        foreach (KeyValuePair<SynSet, Dictionary<int, Set<int>>> synSetEntry in relationEntry.Value)
        {
            // the related synset may still be a shell; its words are needed below
            SynSet target = synSetEntry.Key;
            if (!target.Instantiated)
            {
                target.Instantiate();
            }

            foreach (KeyValuePair<int, Set<int>> indexEntry in synSetEntry.Value)
            {
                // stored word indexes are 1-based
                string sourceWord = _words[indexEntry.Key - 1];
                DictionaryExtensions.EnsureContainsKey(wordsForRelation, sourceWord, typeof(Set<string>), false);
                Set<string> targetWords = wordsForRelation[sourceWord];

                foreach (int targetIndex in indexEntry.Value)
                {
                    targetWords.Add(target.Words[targetIndex - 1]);
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Instantiates the current synset from its data-file definition line. If idSynset is non-null,
/// related synset references are taken from idSynset; otherwise, related synsets are created as shells.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
/// <exception cref="InvalidOperationException">The synset has already been instantiated</exception>
/// <exception cref="ArgumentNullException"><paramref name="definition"/> is null</exception>
/// <exception cref="FormatException">The definition line contains an invalid lexicographer file number,
/// a non-numeric count/offset field, a space inside a word, or a pipe inside the gloss</exception>
internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
{
    // don't re-instantiate
    if (_instantiated)
    {
        throw new InvalidOperationException("Synset has already been instantiated");
    }

    if (definition == null)
    {
        throw new ArgumentNullException("definition");
    }

    /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
     * it starts with None, so we need to add 1 to the definition line's value to get the correct file name. the data files are
     * machine-readable, so all numbers are parsed with the invariant culture. */
    int lexicographerFileNumber = int.Parse(GetField(definition, 1), CultureInfo.InvariantCulture) + 1;
    if (lexicographerFileNumber <= 0)
    {
        throw new FormatException("Invalid lexicographer file name number. Should be >= 1.");
    }

    _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexicographerFileNumber;

    // get number of words in the synset (hexadecimal field) and the start character of the word list
    int wordStart;
    int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
    wordStart = definition.IndexOf(' ', wordStart) + 1;

    // get words in synset
    _words = new List<string>(numWords);
    for (int i = 0; i < numWords; ++i)
    {
        // the word runs up to (but not including) the next space
        int wordSpace = definition.IndexOf(' ', wordStart + 1);
        string word = definition.Substring(wordStart, wordSpace - wordStart);
        if (word.IndexOf(' ') >= 0)
        {
            throw new FormatException("Unexpected space in word: " + word);
        }

        _words.Add(word);

        // skip lex_id field: the next word starts after the space that follows it
        wordStart = definition.IndexOf(' ', wordSpace + 1) + 1;
    }

    // get gloss (everything after the first pipe)
    _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
    if (_gloss.IndexOf('|') >= 0)
    {
        throw new FormatException("Unexpected pipe in gloss");
    }

    // get number and start of relations...the count field follows the word/lex_id pairs
    int relationCountField = 3 + (_words.Count * 2) + 1;
    int relationFieldStart;
    int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart), CultureInfo.InvariantCulture);
    relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

    // grab each related synset
    _relationSynSets = new Dictionary<WordNetEngine.SynSetRelation, Set<SynSet>>();
    _lexicalRelations = new Dictionary<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, Set<int>>>>();
    for (int relationNum = 0; relationNum < numRelations; ++relationNum)
    {
        string relationSymbol = null;
        int relatedSynSetOffset = -1;
        WordNetEngine.POS relatedSynSetPOS = WordNetEngine.POS.None;
        int sourceWordIndex = -1;
        int targetWordIndex = -1;

        // each relation has four columns
        for (int relationField = 0; relationField <= 3; ++relationField)
        {
            int fieldSpace = definition.IndexOf(' ', relationFieldStart + 1);
            string fieldValue = definition.Substring(relationFieldStart, fieldSpace - relationFieldStart);

            switch (relationField)
            {
                // relation symbol
                case 0:
                    relationSymbol = fieldValue;
                    break;

                // related synset offset
                case 1:
                    relatedSynSetOffset = int.Parse(fieldValue, CultureInfo.InvariantCulture);
                    break;

                // related synset POS
                case 2:
                    relatedSynSetPOS = GetPOS(fieldValue);
                    break;

                // source/target word for lexical relation: two hex digits for the source, the rest for the target
                case 3:
                    sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                    targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                    break;
            }

            relationFieldStart = fieldSpace + 1;
        }

        // get related synset...create shell if we don't have a lookup, otherwise look it up directly
        SynSet relatedSynSet;
        if (idSynset == null)
        {
            relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
        }
        else
        {
            relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];
        }

        // get relation
        WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

        // add semantic relation if we have neither a source nor a target word index
        if (sourceWordIndex == 0 && targetWordIndex == 0)
        {
            DictionaryExtensions.EnsureContainsKey(_relationSynSets, relation, typeof(Set<SynSet>));
            _relationSynSets[relation].Add(relatedSynSet);
        }
        // add lexical relation
        else
        {
            DictionaryExtensions.EnsureContainsKey(_lexicalRelations, relation, typeof(Dictionary<SynSet, Dictionary<int, Set<int>>>));
            Dictionary<SynSet, Dictionary<int, Set<int>>> relationsBySynSet = _lexicalRelations[relation];

            DictionaryExtensions.EnsureContainsKey(relationsBySynSet, relatedSynSet, typeof(Dictionary<int, Set<int>>));
            Dictionary<int, Set<int>> targetsBySource = relationsBySynSet[relatedSynSet];

            DictionaryExtensions.EnsureContainsKey(targetsBySource, sourceWordIndex, typeof(Set<int>));
            Set<int> targetIndexes = targetsBySource[sourceWordIndex];

            if (!targetIndexes.Contains(targetWordIndex))
            {
                targetIndexes.Add(targetWordIndex);
            }
        }
    }

    // release the wordnet engine if we have one...don't need it anymore
    _wordNetEngine = null;

    _instantiated = true;
}
/// <summary>
/// Constructor. Verifies that the WordNet data and index files exist, re-sorts the index files once
/// (marked by a flag file in the WordNet directory), and then either loads all synsets into memory or
/// opens search streams and data readers for on-disk lookup.
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
/// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist</exception>
/// <exception cref="FileNotFoundException">One of the data or index files is missing</exception>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    if (!System.IO.Directory.Exists(_wordNetDirectory))
    {
        throw new DirectoryNotFoundException("Non-existent WordNet directory: " + _wordNetDirectory);
    }

    // get data and index paths
    string[] dataPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "data.adj"),
        Path.Combine(_wordNetDirectory, "data.adv"),
        Path.Combine(_wordNetDirectory, "data.noun"),
        Path.Combine(_wordNetDirectory, "data.verb")
    };

    string[] indexPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "index.adj"),
        Path.Combine(_wordNetDirectory, "index.adv"),
        Path.Combine(_wordNetDirectory, "index.noun"),
        Path.Combine(_wordNetDirectory, "index.verb")
    };

    // make sure all files exist
    foreach (string path in Enumerable.Union(dataPaths, indexPaths))
    {
        if (!System.IO.File.Exists(path))
        {
            throw new FileNotFoundException("Failed to find WordNet file: " + path);
        }
    }

    #region index file sorting
    string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
    if (!System.IO.File.Exists(sortFlagPath))
    {
        /* make sure the index files are sorted according to the current sort order. the index files in the
         * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
         * a different sort order than the .net runtime. thus, unless we resort the lines in the index
         * files, we won't be able to do a proper binary search over the data.
         *
         * NOTE(review): List<string>.Sort() below and CompareTo in the on-disk search delegate both use the
         * current culture, so files sorted under one culture may not search correctly under another. this is
         * left unchanged because switching comparers would invalidate index files that are already sorted. */
        foreach (string indexPath in indexPaths)
        {
            // create temporary file for sorted lines
            string tempPath = Path.GetTempFileName();
            using (StreamWriter tempFile = new StreamWriter(tempPath))
            {
                string line;

                // get number of words (lines) in file
                int numWords = 0;
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                    {
                        if (!line.StartsWith(" "))
                        {
                            ++numWords;
                        }
                    }
                }

                // get lines in file, keyed by first column (i.e., the word)
                Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                    {
                        // write header lines to temp file immediately
                        if (line.StartsWith(" "))
                        {
                            tempFile.WriteLine(line);
                        }
                        else
                        {
                            // trim useless blank spaces from line and map line to first column
                            line = line.Trim();
                            wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                        }
                    }
                }

                // get sorted words
                List<string> sortedWords = new List<string>(wordLine.Count);
                sortedWords.AddRange(wordLine.Keys);
                sortedWords.Sort();

                // write lines sorted by word
                foreach (string word in sortedWords)
                {
                    tempFile.WriteLine(wordLine[word]);
                }
            }

            // replace original index file with properly sorted one...all handles on both files are closed by now
            System.IO.File.Delete(indexPath);
            System.IO.File.Move(tempPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        using (StreamWriter sortFlagFile = new StreamWriter(sortFlagPath))
        {
            sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        }
    }
    #endregion

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        ++totalSynsets;
                    }
                }
            }
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // get offset and create synset shell
                        int offset = int.Parse(line.Substring(0, firstSpace));
                        SynSet synset = new SynSet(pos, offset, null);
                        _idSynset.Add(synset.ID, synset);
                    }
                }
            }
        }

        // pass 3: instantiate synsets (hooks up relations, set glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // instantiate synset defined on current line, using the instantiated synsets for all references
                        _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }
            }
        }

        // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
        _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);

            DictionaryExtensions.EnsureContainsKey(_posWordSynSets, pos, typeof(Dictionary<string, Set<SynSet>>));

            // scan word index file, skipping header lines
            using (StreamReader indexFile = new StreamReader(indexPath))
            {
                string line;
                while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // grab word and synset shells, along with the most common synset
                        string word = line.Substring(0, firstSpace);
                        SynSet mostCommonSynSet;
                        Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                        // set flag on most common synset if it's ambiguous
                        if (synsets.Count > 1)
                        {
                            _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);
                        }

                        // use reference to the synsets that we instantiated in our three-pass routine above
                        _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                        foreach (SynSet synset in synsets)
                        {
                            _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
            {
                // if we landed on the header text, search further down
                if (currentLine[0] == ' ')
                {
                    return 1;
                }

                // get word on current line
                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                // compare searched-for word to the current word
                return ((string)searchWord).CompareTo(currentWord);
            }));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files...these are stored in a field and deliberately left open here
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
        {
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
        }
    }
    #endregion
}