/// <summary>
/// Looks up a single synset by its ID. In in-memory mode the synset is taken straight from the ID lookup table;
/// otherwise the ID is split into its POS and offset parts, a shell synset is created for them and that shell is
/// instantiated before being returned.
/// </summary>
/// <param name="synsetID">ID of synset in the format returned by SynSet.ID (i.e., POS:Offset)</param>
/// <returns>The synset identified by <paramref name="synsetID"/></returns>
public SynSet GetSynSet(string synsetID)
{
    // everything is already loaded: a dictionary lookup is all that is needed
    if (_inMemory)
    {
        return _idSynset[synsetID];
    }

    // the ID has the form "POS:Offset" - split it at the colon
    int separatorIndex = synsetID.IndexOf(':');
    string posName = synsetID.Substring(0, separatorIndex);
    POS partOfSpeech = (POS)Enum.Parse(typeof(POS), posName);
    string offsetText = synsetID.Substring(separatorIndex + 1);
    int dataOffset = int.Parse(offsetText);

    // build the shell, then fill it in
    SynSet result = new SynSet(partOfSpeech, dataOffset, this);
    result.Instantiate();

    return result;
}
/// <summary>
/// Builds synset shells from a word index line. A shell is a SynSet constructed with only a POS, an offset and the
/// supplied engine reference. The method is static so that it cannot accidentally pick up a current WordNetEngine;
/// the engine to use must be handed in explicitly.
/// </summary>
/// <param name="wordIndexLine">Word index line from which to get synset shells</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="wordNetEngine">WordNetEngine to pass to the constructor of each synset shell</param>
/// <returns>Synset shells for the given index line, in last-to-first order of the offsets on the line</returns>
private static List<SynSet> GetSynSetShells(string wordIndexLine, POS pos, WordNetEngine wordNetEngine)
{
    // split the line into its space-separated fields; the third field is the synset count
    char[] separators = new char[] { ' ' };
    string[] fields = wordIndexLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    int synsetCount = int.Parse(fields[2]);

    // the offsets are the trailing synsetCount fields; walk them backwards
    List<SynSet> shells = new List<SynSet>();
    int lowestOffsetField = fields.Length - synsetCount;
    int fieldIndex = fields.Length - 1;
    while (fieldIndex >= lowestOffsetField)
    {
        int synsetOffset = int.Parse(fields[fieldIndex]);
        shells.Add(new SynSet(pos, synsetOffset, wordNetEngine));
        --fieldIndex;
    }

    return shells;
}
/// <summary>
/// Gets the shortest path from the current synset to another, following the given synset relations. The search
/// is breadth-first: synsets are expanded one level at a time from a queue, and each newly reached synset records
/// the synset it was reached from in its search backpointer so the path can be rebuilt once the destination is
/// dequeued. All backpointers touched by the search are reset to null before the method exits, including when
/// the search is aborted by an exception.
/// </summary>
/// <param name="destination">Destination synset</param>
/// <param name="relations">Relations to follow, or null for all relations.</param>
/// <returns>Synset path (starting with the current synset and ending with the destination), or null if none exists.</returns>
public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<WordNetEngine.SynSetRelation> relations)
{
    // null means "follow every relation type"
    if (relations == null)
    {
        relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)) as WordNetEngine.SynSetRelation[];
    }

    // make sure the backpointer on the current synset is null - can't predict what other functions might do
    _searchBackPointer = null;

    // avoid cycles. A hash set gives constant-time membership tests where the previous list was scanned
    // linearly for every neighbour.
    // NOTE(review): relies on SynSet.GetHashCode being consistent with SynSet.Equals - confirm.
    HashSet<SynSet> synsetsEncountered = new HashSet<SynSet>();
    synsetsEncountered.Add(this);

    // start search queue
    Queue<SynSet> searchQueue = new Queue<SynSet>();
    searchQueue.Enqueue(this);

    // run search
    List<SynSet> path = null;
    try
    {
        while (searchQueue.Count > 0 && path == null)
        {
            SynSet currSynSet = searchQueue.Dequeue();

            // see if we've finished the search
            if (currSynSet == destination)
            {
                // gather synsets along path by walking the backpointers to the start
                path = new List<SynSet>();
                while (currSynSet != null)
                {
                    path.Add(currSynSet);
                    currSynSet = currSynSet.SearchBackPointer;
                }

                // backpointers run destination -> start, so reverse for the correct order
                path.Reverse();
            }
            // expand the search one level
            else
            {
                foreach (SynSet synset in currSynSet.GetRelatedSynSets(relations, false))
                {
                    // Add returns false when the synset has been seen before
                    if (synsetsEncountered.Add(synset))
                    {
                        synset.SearchBackPointer = currSynSet;
                        searchQueue.Enqueue(synset);
                    }
                }
            }
        }
    }
    finally
    {
        // null-out all search backpointers, even if the expansion threw
        foreach (SynSet synset in synsetsEncountered)
        {
            synset.SearchBackPointer = null;
        }
    }

    return path;
}
/// <summary>
/// Fills in the current synset (words, gloss, semantic relations and lexical relations) from its definition
/// line. When idSynset is supplied, related synsets are taken from it; when it is null, each related synset is
/// created as a new shell. A synset that has already been instantiated is not parsed again. In every case the
/// reference to the WordNet engine is dropped at the end.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
{
    // parse only once
    if (!_instantiated)
    {
        // field 3 is the word count (parsed as hexadecimal); the word list starts after the next space
        int wordCursor;
        int wordCount = int.Parse(GetField(definition, 3, out wordCursor), NumberStyles.HexNumber);
        wordCursor = definition.IndexOf(' ', wordCursor) + 1;

        _words = new List<string>(wordCount);

        // each word is followed by one extra field, which is stepped over
        for (int wordNum = 0; wordNum < wordCount; ++wordNum)
        {
            int spaceAfterWord = definition.IndexOf(' ', wordCursor + 1);
            _words.Add(definition.Substring(wordCursor, spaceAfterWord - wordCursor));

            // lex_id is read from the two characters that follow the first space on the line
            // NOTE(review): this value does not depend on the current word - confirm this is the intended field
            lex_id = Convert.ToInt32(definition.Substring(definition.IndexOf(' ') + 1, 2));

            // move past the field that follows the word
            wordCursor = definition.IndexOf(' ', spaceAfterWord + 1) + 1;
        }

        // the gloss is everything after the first '|'
        int glossBar = definition.IndexOf('|');
        _gloss = definition.Substring(glossBar + 1).Trim();

        // the relation count sits after the word-count field and two fields per word
        int relationCursor;
        int relationCountFieldIndex = 3 + (_words.Count * 2) + 1;
        int relationCount = int.Parse(GetField(definition, relationCountFieldIndex, out relationCursor));
        relationCursor = definition.IndexOf(' ', relationCursor) + 1;

        _relationSynSets = new Dictionary<WordNetEngine.SynSetRelation, List<SynSet>>();
        _lexicalRelations = new Dictionary<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, List<int>>>>();

        for (int relationIndex = 0; relationIndex < relationCount; ++relationIndex)
        {
            string symbol = null;
            int targetOffset = -1;
            WordNetEngine.POS targetPos = WordNetEngine.POS.None;
            int sourceWord = -1;
            int targetWord = -1;

            // a relation is four consecutive space-terminated columns
            for (int column = 0; column <= 3; ++column)
            {
                int spaceAfterColumn = definition.IndexOf(' ', relationCursor + 1);
                string columnText = definition.Substring(relationCursor, spaceAfterColumn - relationCursor);

                switch (column)
                {
                    case 0:
                        // relation symbol
                        symbol = columnText;
                        break;

                    case 1:
                        // offset of the related synset
                        targetOffset = int.Parse(columnText);
                        break;

                    case 2:
                        // POS of the related synset
                        targetPos = GetPOS(columnText);
                        break;

                    case 3:
                        // first two hex digits: source word index; remainder: target word index
                        sourceWord = int.Parse(columnText.Substring(0, 2), NumberStyles.HexNumber);
                        targetWord = int.Parse(columnText.Substring(2), NumberStyles.HexNumber);
                        break;

                    default:
                        throw new Exception();
                }

                relationCursor = spaceAfterColumn + 1;
            }

            // without a lookup table the related synset is a fresh shell; with one it is fetched by "POS:Offset"
            SynSet related;
            if (idSynset != null)
            {
                related = idSynset[targetPos + ":" + targetOffset];
            }
            else
            {
                related = new SynSet(targetPos, targetOffset, _wordNetEngine);
            }

            WordNetEngine.SynSetRelation relationType = WordNetEngine.GetSynSetRelation(_pos, symbol);

            bool isLexical = sourceWord != 0 || targetWord != 0;
            if (!isLexical)
            {
                // both word indexes are zero: relation between whole synsets
                _relationSynSets.EnsureContainsKey(relationType, typeof(List<SynSet>));
                _relationSynSets[relationType].Add(related);
            }
            else
            {
                // relation between specific words: relation -> related synset -> source word -> target words
                _lexicalRelations.EnsureContainsKey(relationType, typeof(Dictionary<SynSet, Dictionary<int, List<int>>>));
                Dictionary<SynSet, Dictionary<int, List<int>>> bySynSet = _lexicalRelations[relationType];

                bySynSet.EnsureContainsKey(related, typeof(Dictionary<int, List<int>>));
                Dictionary<int, List<int>> bySourceWord = bySynSet[related];

                bySourceWord.EnsureContainsKey(sourceWord, typeof(List<int>));
                List<int> targetWords = bySourceWord[sourceWord];

                // keep each target word index only once
                if (!targetWords.Contains(targetWord))
                {
                    targetWords.Add(targetWord);
                }
            }
        }

        _instantiated = true;
    }

    // the engine reference is not needed after this point, so let go of it
    _wordNetEngine = null;
}
/// <summary>
/// Constructor. Verifies that the WordNet directory and its four data and four index files exist, opens the
/// lemmatizer exception files, and then initializes the engine either fully in memory or backed by on-disk
/// search streams and data file readers.
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
/// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist</exception>
/// <exception cref="FileNotFoundException">One of the data or index files is missing; FileName holds its path</exception>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    if (!System.IO.Directory.Exists(_wordNetDirectory))
    {
        throw new DirectoryNotFoundException("Error 502");
    }

    // get data and index paths
    string[] dataPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "data.adj"),
        Path.Combine(_wordNetDirectory, "data.adv"),
        Path.Combine(_wordNetDirectory, "data.noun"),
        Path.Combine(_wordNetDirectory, "data.verb")
    };

    string[] indexPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "index.adj"),
        Path.Combine(_wordNetDirectory, "index.adv"),
        Path.Combine(_wordNetDirectory, "index.noun"),
        Path.Combine(_wordNetDirectory, "index.verb")
    };

    // make sure all files exist
    foreach (string path in dataPaths.Union(indexPaths))
    {
        if (!System.IO.File.Exists(path))
        {
            // same message as before; the missing path is reported through FileName
            throw new FileNotFoundException("Error 502", path);
        }
    }

    // UPDATE [HASSAN:11/03/2017]: The lemmatizer requires the exception dictionary for each POS to be loaded
    // as a stream. These readers are stored in LemmaExcptionsFile and intentionally left open.
    // Paths are built with Path.Combine, like the data and index paths above, instead of a hard-coded "\\".
    LemmaExcptionsFile = new Dictionary<string, StreamReader>(4);
    LemmaExcptionsFile.Add("noun", new StreamReader(Path.Combine(wordNetDirectory, "noun.exc")));
    LemmaExcptionsFile.Add("verb", new StreamReader(Path.Combine(wordNetDirectory, "verb.exc")));

    // UPDATE [HASSAN:11/07/2017]: The lemmatizer requires the exception dictionary for noun only in the
    // context of the SemCluster tool. In order to implement the lemmatizer for all 4 POS tags you will need
    // the following:
    //   1) Uncomment the following lines.
    //   2) Uncomment the lines in suffixMap variable.
    //   3) Uncomment the GetSynsets Switch section
    //   4) Add Exception files for each POS in the data folder
    //LemmaExcptionsFile.Add("adjective", new StreamReader(Path.Combine(wordNetDirectory, "adj.exc")));
    //LemmaExcptionsFile.Add("adverb", new StreamReader(Path.Combine(wordNetDirectory, "adv.exc")));

    // UPDATE [HASSAN:28/01/2016]: The #region index file sorting has been removed here, since it is required
    // to run only for the first program execution.

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        ++totalSynsets;
                    }
                }
            }
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // get offset and create synset shell
                        int offset = int.Parse(line.Substring(0, firstSpace));
                        SynSet synset = new SynSet(pos, offset, null);

                        _idSynset.Add(synset.ID, synset);
                    }
                }
            }
        }

        // pass 3: instantiate synsets (hooks up relations, set glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // instantiate synset defined on current line, using the instantiated synsets for all references
                        _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }
            }
        }

        // organize synsets by pos and words
        _posWordSynSets = new Dictionary<POS, Dictionary<string, List<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);

            _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, List<SynSet>>));

            // scan word index file, skipping header lines
            using (StreamReader indexFile = new StreamReader(indexPath))
            {
                string line;
                while (indexFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // grab word and synset shells
                        string word = line.Substring(0, firstSpace);
                        List<SynSet> synsets = GetSynSetShells(line, pos, null);

                        // use reference to the synsets that we instantiated in our three-pass routine above
                        List<SynSet> wordSynSets = new List<SynSet>(synsets.Count);
                        _posWordSynSets[pos].Add(word, wordSynSets);
                        foreach (SynSet synset in synsets)
                        {
                            wordSynSets.Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(
                delegate(string searchWord, string currentLine)
                {
                    // if we landed on the header text, search further down
                    if (currentLine[0] == ' ')
                    {
                        return 1;
                    }

                    // get word on current line
                    string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                    // compare searched-for word to the current word
                    return ((string)searchWord).CompareTo(currentWord);
                }
            ));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files; these stay open for later on-disk lookups
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
        {
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
        }
    }
    #endregion
}