/// <summary>
/// Instantiates the current synset from its definition line: sets the lexicographer file name, the words,
/// the gloss, and the semantic and lexical relations. If idSynset is non-null, related synset references are
/// looked up in idSynset; otherwise, related synsets are created as shells.
/// </summary>
/// <param name="definition">Definition line of synset from data file</param>
/// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
/// <exception cref="InvalidOperationException">The synset has already been instantiated.</exception>
/// <exception cref="FormatException">
/// The lexicographer file number is invalid, a word contains a space, or the gloss contains a pipe.
/// </exception>
internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
{
    // don't re-instantiate
    if (Instantiated)
    {
        throw new InvalidOperationException("Synset has already been instantiated");
    }

    /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
     * it starts with None, so we need to add 1 to the definition line's value to get the correct file name */
    int lexicographerFileNumber = int.Parse(GetField(definition, 1), CultureInfo.InvariantCulture) + 1;
    if (lexicographerFileNumber <= 0)
    {
        throw new FormatException("Invalid lexicographer file name number. Should be >= 1.");
    }

    LexicographerFileName = (LexicographerFileName)lexicographerFileNumber;

    // get number of words in the synset (a hexadecimal field) and the start character of the word list
    int numWords = int.Parse(GetField(definition, 3, out int wordStart), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
    wordStart = definition.IndexOf(' ', wordStart) + 1;

    // get words in synset
    Words = new List<string>(numWords);
    for (int i = 0; i < numWords; ++i)
    {
        // after this call wordStart sits on the first character of the lex_id field that follows the word
        string word = NextField(ref wordStart);
        if (word.Contains(' '))
        {
            throw new FormatException("Unexpected space in word: " + word);
        }

        Words.Add(word);

        // skip lex_id field
        wordStart = definition.IndexOf(' ', wordStart) + 1;
    }

    // get gloss: everything after the first pipe, trimmed
    Gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
    if (Gloss.Contains('|'))
    {
        throw new FormatException("Unexpected pipe in gloss");
    }

    // get number and start of relations...the count field follows the (word, lex_id) pairs
    int relationCountField = 3 + Words.Count * 2 + 1;
    int numRelations = int.Parse(GetField(definition, relationCountField, out int relationFieldStart), CultureInfo.InvariantCulture);
    relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

    // grab each related synset
    relationSynSets = new Dictionary<SynSetRelation, List<SynSet>>();
    lexicalRelations = new Dictionary<SynSetRelation, Dictionary<SynSet, Dictionary<int, List<int>>>>();
    for (int relationNum = 0; relationNum < numRelations; ++relationNum)
    {
        // each relation has four columns: symbol, related synset offset, related synset POS, source/target words
        string relationSymbol = NextField(ref relationFieldStart);
        int relatedSynSetOffset = int.Parse(NextField(ref relationFieldStart), CultureInfo.InvariantCulture);
        WordType relatedSynSetPOS = GetPOS(NextField(ref relationFieldStart));

        // the last column packs two hexadecimal indexes: the first two characters are the source word, the rest the target word
        string sourceTarget = NextField(ref relationFieldStart);
        int sourceWordIndex = int.Parse(sourceTarget.Substring(0, 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
        int targetWordIndex = int.Parse(sourceTarget.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);

        // get related synset...create shell if we don't have a lookup, otherwise look it up directly
        SynSet relatedSynSet;
        if (idSynset == null)
        {
            relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset);
        }
        else
        {
            relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];
        }

        // get relation
        SynSetRelation relation = WordNetEngine.GetSynSetRelation(POS, relationSymbol);

        // add semantic relation if we have neither a source nor a target word index
        if (sourceWordIndex == 0 && targetWordIndex == 0)
        {
            var list = relationSynSets.GetItemCreate(relation);
            list.Add(relatedSynSet);
        }
        // add lexical relation, recording each (source word, target word) pair once
        else
        {
            var itemRelation = lexicalRelations.GetItemCreate(relation);
            var itemRelatedSynSet = itemRelation.GetItemCreate(relatedSynSet);
            var itemSourceWordIndex = itemRelatedSynSet.GetItemCreate(sourceWordIndex);
            if (!itemSourceWordIndex.Contains(targetWordIndex))
            {
                itemSourceWordIndex.Add(targetWordIndex);
            }
        }
    }

    Instantiated = true;

    // Returns the text from 'start' up to (not including) the next space found after 'start', and moves
    // 'start' to the character following that space. The search begins at start + 1, so a field is always
    // at least one character long.
    string NextField(ref int start)
    {
        int space = definition.IndexOf(' ', start + 1);
        string value = definition.Substring(start, space - start);
        start = space + 1;
        return value;
    }
}
/// <summary>
/// Loads the WordNet distribution found in WordNetDirectory. Verifies that the data and index files exist,
/// re-sorts the index files if the ".sorted_for_dot_net" flag file is absent, builds the idSynset lookup from
/// the data files in three passes, and fills posWordSynSets from the index files.
/// </summary>
/// <exception cref="DirectoryNotFoundException">WordNetDirectory does not exist.</exception>
/// <exception cref="FileNotFoundException">One of the data or index files is missing.</exception>
private void Load()
{
    if (!Directory.Exists(WordNetDirectory))
    {
        throw new DirectoryNotFoundException("Non-existent WordNet directory: " + WordNetDirectory);
    }

    // data and index files, one of each per part of speech
    var dataPaths = new[]
    {
        Path.Combine(WordNetDirectory, "data.adj"),
        Path.Combine(WordNetDirectory, "data.adv"),
        Path.Combine(WordNetDirectory, "data.noun"),
        Path.Combine(WordNetDirectory, "data.verb")
    };

    var indexPaths = new[]
    {
        Path.Combine(WordNetDirectory, "index.adj"),
        Path.Combine(WordNetDirectory, "index.adv"),
        Path.Combine(WordNetDirectory, "index.noun"),
        Path.Combine(WordNetDirectory, "index.verb")
    };

    // every file must be present...report the first one that is not
    string missingPath = dataPaths.Union(indexPaths).FirstOrDefault(candidate => !File.Exists(candidate));
    if (missingPath != null)
    {
        throw new FileNotFoundException("Failed to find WordNet file: " + missingPath);
    }

    /* The index files in the wordnet distribution are sorted in the order needed for (presumably) the java api,
     * which uses a different sort order than the .net runtime. Unless the lines are re-sorted with the current
     * sort order, a binary search over them will not work. This is done once; a flag file records it. */
    string sortedMarkerPath = Path.Combine(WordNetDirectory, ".sorted_for_dot_net");
    if (!File.Exists(sortedMarkerPath))
    {
        foreach (string indexPath in indexPaths)
        {
            // the sorted lines are written to a temporary file first
            string sortedCopyPath = Path.GetTempFileName();
            using (var sortedCopy = new StreamWriter(sortedCopyPath))
            {
                // first read: count the word lines (those not starting with a space) to pre-size the lookup
                int entryCount = 0;
                using (TextReader counter = new StreamReader(indexPath))
                {
                    for (string line = counter.ReadLine(); line != null; line = counter.ReadLine())
                    {
                        if (!line.StartsWith(" "))
                        {
                            ++entryCount;
                        }
                    }
                }

                // second read: copy header lines straight through and map each word (first column) to its line
                var lineByWord = new Dictionary<string, string>(entryCount, StringComparer.OrdinalIgnoreCase);
                using (var reader = new StreamReader(indexPath))
                {
                    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                    {
                        if (line.StartsWith(" "))
                        {
                            sortedCopy.WriteLine(line);
                            continue;
                        }

                        // trim useless blank spaces before keying the line by its word
                        string entry = line.Trim();
                        string word = entry.Substring(0, entry.IndexOf(' '));
                        lineByWord.Add(word, entry);
                    }
                }

                // write the word lines ordered by word, using the default string ordering
                var orderedWords = new List<string>(lineByWord.Keys);
                orderedWords.Sort();
                foreach (string word in orderedWords)
                {
                    sortedCopy.WriteLine(lineByWord[word]);
                }
            }

            // replace original index file with properly sorted one
            File.Delete(indexPath);
            File.Move(sortedCopyPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        using (var marker = new StreamWriter(sortedMarkerPath))
        {
            marker.WriteLine(
                "This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        }
    }

    // pass 1: count the synset definition lines across all data files so the lookup can be pre-sized
    int totalSynsets = dataPaths.Sum(dataPath => ReadEntryLines(dataPath).Count());

    // pass 2: create synset shells (pos and offset only)
    idSynset = new Dictionary<string, SynSet>(totalSynsets, StringComparer.OrdinalIgnoreCase);
    foreach (string dataPath in dataPaths)
    {
        WordType pos = GetFilePOS(dataPath);
        foreach (string entry in ReadEntryLines(dataPath))
        {
            // the first column of a definition line is the synset offset
            int offset = int.Parse(entry.Substring(0, entry.IndexOf(' ')));
            var shell = new SynSet(pos, offset);
            idSynset.Add(shell.ID, shell);
        }
    }

    // pass 3: instantiate synsets (hooks up relations, set glosses, etc.) using the shells for all references
    foreach (string dataPath in dataPaths)
    {
        WordType pos = GetFilePOS(dataPath);
        foreach (string entry in ReadEntryLines(dataPath))
        {
            int offset = int.Parse(entry.Substring(0, entry.IndexOf(' ')));
            string id = pos + ":" + offset;
            idSynset[id].Instantiate(entry, idSynset);
        }
    }

    // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
    posWordSynSets = new Dictionary<WordType, Dictionary<string, List<SynSet>>>();
    foreach (string indexPath in indexPaths)
    {
        WordType pos = GetFilePOS(indexPath);
        posWordSynSets.GetItemCreate(pos);
        var synSetsByWord = posWordSynSets[pos];

        foreach (string entry in ReadEntryLines(indexPath))
        {
            // grab word and synset shells, along with the most common synset
            string word = entry.Substring(0, entry.IndexOf(' '));
            List<SynSet> shells = GetSynSetShells(entry, pos, out SynSet mostCommon);

            // set flag on most common synset if the word is ambiguous
            if (shells.Count > 1)
            {
                idSynset[mostCommon.ID].SetAsMostCommonSynsetFor(word);
            }

            // store references to the synsets instantiated in the three-pass routine above, not the shells
            var resolved = new List<SynSet>(shells.Count);
            synSetsByWord.Add(word, resolved);
            foreach (SynSet shell in shells)
            {
                resolved.Add(idSynset[shell.ID]);
            }
        }
    }

    // Lazily yields the lines of a file whose first space is not at position 0 (and that contain a space at
    // all); this skips the header lines, which start with a space. The file stays open while the caller iterates.
    IEnumerable<string> ReadEntryLines(string filePath)
    {
        using (var source = new StreamReader(filePath))
        {
            string row;
            while ((row = source.ReadLine()) != null)
            {
                if (row.IndexOf(' ') > 0)
                {
                    yield return row;
                }
            }
        }
    }
}
/// <summary>
/// Gets the shortest path from the current synset to another, following the given synset relations. The search
/// is breadth-first and temporarily sets SearchBackPointer on the synsets it visits; all of those back pointers
/// are reset to null before the method returns.
/// </summary>
/// <param name="destination">Destination synset</param>
/// <param name="relations">Relations to follow, or null for all relations.</param>
/// <returns>Synset path (starting with the current synset and ending with the destination), or null if none exists.</returns>
public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<SynSetRelation> relations)
{
    if (relations == null)
    {
        relations = (SynSetRelation[])Enum.GetValues(typeof(SynSetRelation));
    }

    // make sure the backpointer on the current synset is null - can't predict what other functions might do
    SearchBackPointer = null;

    // avoid cycles...a hash set keeps the "already seen" test constant-time instead of scanning a list for
    // every expanded edge (SynSet is already used as a dictionary key by the lexical relations lookup)
    HashSet<SynSet> synsetsEncountered = new HashSet<SynSet>();
    synsetsEncountered.Add(this);

    // start search queue
    Queue<SynSet> searchQueue = new Queue<SynSet>();
    searchQueue.Enqueue(this);

    // run search
    List<SynSet> path = null;
    while (searchQueue.Count > 0 && path == null)
    {
        SynSet currSynSet = searchQueue.Dequeue();

        // see if we've finished the search
        if (currSynSet == destination)
        {
            // gather synsets along path by walking the back pointers to the start
            path = new List<SynSet>();
            while (currSynSet != null)
            {
                path.Add(currSynSet);
                currSynSet = currSynSet.SearchBackPointer;
            }

            // reverse for the correct order
            path.Reverse();
        }
        // expand the search one level
        else
        {
            foreach (SynSet synset in currSynSet.GetRelatedSynSets(relations, false))
            {
                // Add returns false when the synset has been seen before, so each synset is enqueued at most once
                if (synsetsEncountered.Add(synset))
                {
                    synset.SearchBackPointer = currSynSet;
                    searchQueue.Enqueue(synset);
                }
            }
        }
    }

    // null-out all search backpointers
    foreach (SynSet synset in synsetsEncountered)
    {
        synset.SearchBackPointer = null;
    }

    return path;
}
/// <summary>
/// Gets synset shells from a word index line. A synset shell is an instance of SynSet constructed from only a
/// POS and an offset. The number of synsets is read from the third space-separated column, and the offsets are
/// taken from that many trailing columns.
/// </summary>
/// <param name="wordIndexLine">Word index line from which to get synset shells</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="mostCommonSynSet">Returns the most common synset for the word (the first offset listed on the line)</param>
/// <returns>Synset shells for the given index line, in reverse of the order in which the offsets are listed</returns>
private static List<SynSet> GetSynSetShells(string wordIndexLine, WordType pos, out SynSet mostCommonSynSet)
{
    mostCommonSynSet = null;
    var shells = new List<SynSet>();

    // split into columns, ignoring runs of spaces; the third column holds the number of synsets
    string[] columns = wordIndexLine.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int synSetCount = int.Parse(columns[2]);

    // the offsets occupy the trailing columns...walk them from the last one back to the first one
    int firstOffsetColumn = columns.Length - synSetCount;
    int column = columns.Length - 1;
    while (column >= firstOffsetColumn)
    {
        int offset = int.Parse(columns[column]);
        shells.Add(new SynSet(pos, offset));
        --column;
    }

    // because of the reverse walk, the shell added last belongs to the first listed offset: the most common synset
    if (shells.Count > 0)
    {
        mostCommonSynSet = shells[shells.Count - 1];
    }

    if (mostCommonSynSet == null)
    {
        throw new Exception("Failed to get most common synset");
    }

    return shells;
}