Exemple #1
0
        /// <summary>
        /// Builds the form: creates the WordNet engine, fills the part-of-speech list and resets the
        /// semantic-similarity selection state.
        /// </summary>
        public TestForm()
        {
            InitializeComponent();

            // build the engine over the "WordNet" directory, passing true for the second (in-memory) argument
            _wordNetEngine = new WordNetEngine(@"WordNet", true);

            // append a note to the text of the "test" control when the engine reports it is not in memory
            bool engineInMemory = _wordNetEngine.InMemory;
            if (!engineInMemory)
            {
                test.Text += " (will take a while)";
            }

            // offer every POS value except the None placeholder
            foreach (WordNetEngine.POS candidate in Enum.GetValues(typeof(WordNetEngine.POS)))
            {
                if (candidate == WordNetEngine.POS.None)
                {
                    continue;
                }

                pos.Items.Add(candidate);
            }

            pos.SelectedIndex = 0;

            // let long synset entries be scrolled horizontally
            synSets.HorizontalScrollbar = true;

            // nothing is selected for similarity yet; keep the initial text of ss1 for later use
            _semSimSs2 = null;
            _semSimSs1 = null;
            _origSsLbl = ss1.Text;
            _semanticSimilarityModel = new WordNetSimilarityModel(_wordNetEngine);
        }
        /// <summary>
        /// Parses one line of a word index file into synset shells. A shell is a SynSet constructed from just a POS
        /// and an offset (plus the engine reference), which is enough to locate the full synset in the matching data
        /// file later. The method is static so that it cannot silently pick up some current WordNetEngine; the engine
        /// to use must be supplied through the corresponding parameter.
        /// </summary>
        /// <param name="wordIndexLine">Space-delimited word index line; its third field is read as the synset count and
        /// its trailing fields as the synset offsets</param>
        /// <param name="pos">POS of the given index line</param>
        /// <param name="mostCommonSynSet">Receives the most common synset for the word, taken to be the shell for the
        /// first offset listed on the line</param>
        /// <param name="wordNetEngine">WordNetEngine handed to the constructor of each synset shell</param>
        /// <returns>Synset shells for the given index line, one per offset</returns>
        /// <exception cref="Exception">Thrown when no offset was processed, so there is no most common synset</exception>
        private static Set<SynSet> GetSynSetShells(string wordIndexLine, POS pos, out SynSet mostCommonSynSet, WordNetEngine wordNetEngine)
        {
            // tokenize the line; the synset count is the third token
            char[] delimiters = new char[] { ' ' };
            string[] fields = wordIndexLine.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            int synsetCount = int.Parse(fields[2]);

            // the offsets are the last synsetCount tokens of the line
            int firstOffsetField = fields.Length - synsetCount;

            Set<SynSet> shells = new Set<SynSet>();
            SynSet lastCreated = null;

            // walk the offsets backwards, so the final shell created belongs to the first listed offset
            int field = fields.Length - 1;
            while (field >= firstOffsetField)
            {
                lastCreated = new SynSet(pos, int.Parse(fields[field]), wordNetEngine);
                shells.Add(lastCreated);
                --field;
            }

            // the shell for the first listed offset is reported as the most common synset
            mostCommonSynSet = lastCreated;
            if (mostCommonSynSet == null)
                throw new Exception("Failed to get most common synset");

            return shells;
        }
        /// <summary>
        /// Looks up a synset by its ID.
        /// </summary>
        /// <param name="synsetID">ID of synset in the format returned by SynSet.ID (i.e., POS:Offset)</param>
        /// <returns>The stored synset when the engine is in-memory; otherwise a newly created synset that has been
        /// instantiated</returns>
        public SynSet GetSynSet(string synsetID)
        {
            // in-memory engines hold their synsets in a lookup keyed by ID
            if (_inMemory)
                return _idSynset[synsetID];

            // otherwise split the ID at its first colon into the POS name and the numeric offset
            int separator = synsetID.IndexOf(':');
            string posName = synsetID.Substring(0, separator);
            string offsetText = synsetID.Substring(separator + 1);

            POS pos = (POS)Enum.Parse(typeof(POS), posName);
            int offset = int.Parse(offsetText);

            // build a shell bound to this engine, then instantiate it
            SynSet shell = new SynSet(pos, offset, this);
            shell.Instantiate();

            return shell;
        }
        /// <summary>
        /// Constructor. Checks that the WordNet directory and its eight data/index files exist, re-sorts the index
        /// files the first time the directory is used (recording this with a flag file), and then either loads all
        /// synsets into memory or opens on-disk search streams and data file readers.
        /// </summary>
        /// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
        /// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
        /// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
        /// binary search algorithm.</param>
        /// <exception cref="DirectoryNotFoundException">The given WordNet directory does not exist</exception>
        /// <exception cref="FileNotFoundException">One of the data.* or index.* files is missing from the directory</exception>
        public WordNetEngine(string wordNetDirectory, bool inMemory)
        {
            _wordNetDirectory = wordNetDirectory;
            _inMemory = inMemory;
            _posIndexWordSearchStream = null;
            _posSynSetDataFile = null;

            if (!System.IO.Directory.Exists(_wordNetDirectory))
                throw new DirectoryNotFoundException("Non-existent WordNet directory:  " + _wordNetDirectory);

            // get data and index paths (one data file and one index file per part of speech)
            string[] dataPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "data.adj"),
                Path.Combine(_wordNetDirectory, "data.adv"),
                Path.Combine(_wordNetDirectory, "data.noun"),
                Path.Combine(_wordNetDirectory, "data.verb")
            };

            string[] indexPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "index.adj"),
                Path.Combine(_wordNetDirectory, "index.adv"),
                Path.Combine(_wordNetDirectory, "index.noun"),
                Path.Combine(_wordNetDirectory, "index.verb")
            };

            // make sure all files exist
            foreach (string path in dataPaths.Union(indexPaths))
                if (!System.IO.File.Exists(path))
                    throw new FileNotFoundException("Failed to find WordNet file:  " + path);

            #region index file sorting
            // the presence of this flag file means the index files were already re-sorted by an earlier run
            string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
            if (!System.IO.File.Exists(sortFlagPath))
            {
                /* make sure the index files are sorted according to the current sort order. the index files in the
                 * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
                 * a different sort order than the .net runtime. thus, unless we resort the lines in the index
                 * files, we won't be able to do a proper binary search over the data. */
                foreach (string indexPath in indexPaths)
                {
                    // create temporary file for sorted lines
                    string tempPath = Path.GetTempFileName();
                    StreamWriter tempFile = new StreamWriter(tempPath);

                    // get number of words (lines) in file...lines starting with a space are header lines and are not counted
                    // NOTE(review): no reader opened in this constructor is explicitly closed here - presumably
                    // TryReadLine closes the reader once it runs out of lines; confirm against that helper
                    int numWords = 0;
                    StreamReader indexFile = new StreamReader(indexPath);
                    string line;
                    while (indexFile.TryReadLine(out line))
                        if (!line.StartsWith(" "))
                            ++numWords;

                    // get lines in file, sorted by first column (i.e., the word)...the count from the first pass is
                    // only used to presize the dictionary. Add throws if the same word appears on two lines.
                    Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                    indexFile = new StreamReader(indexPath);
                    while (indexFile.TryReadLine(out line))
                        // write header lines to temp file immediately
                        if (line.StartsWith(" "))
                            tempFile.WriteLine(line);
                        else
                        {
                            // trim useless blank spaces from line and map line to first column
                            line = line.Trim();
                            wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                        }

                    // get sorted words...Sort() uses the default string ordering, which is what the comparison
                    // delegate of the on-disk binary search below (string.CompareTo) relies on
                    List<string> sortedWords = new List<string>(wordLine.Count);
                    sortedWords.AddRange(wordLine.Keys);
                    sortedWords.Sort();

                    // write lines sorted by word
                    foreach (string word in sortedWords)
                        tempFile.WriteLine(wordLine[word]);

                    tempFile.Close();

                    // replace original index file with properly sorted one
                    System.IO.File.Delete(indexPath);
                    System.IO.File.Move(tempPath, indexPath);
                }

                // create flag file, indicating that we've sorted the data
                StreamWriter sortFlagFile = new StreamWriter(sortFlagPath);
                sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
                sortFlagFile.Close();
            }
            #endregion

            #region engine init
            if (inMemory)
            {
                // pass 1:  get total number of synsets (used only to presize the ID lookup)
                int totalSynsets = 0;
                foreach (string dataPath in dataPaths)
                {
                    // scan synset data file for lines that don't start with a space...these are synset definition lines
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                            ++totalSynsets;
                    }
                }

                // pass 2:  create synset shells (pos and offset only)...every shell must exist before pass 3 so that
                // relations can be resolved through the lookup
                _idSynset = new Dictionary<string, SynSet>(totalSynsets);
                foreach (string dataPath in dataPaths)
                {
                    // NOTE(review): GetFilePOS presumably derives the POS from the file name - confirm
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                        {
                            // get offset and create synset shell...no engine reference is passed to in-memory shells
                            int offset = int.Parse(line.Substring(0, firstSpace));
                            SynSet synset = new SynSet(pos, offset, null);

                            _idSynset.Add(synset.ID, synset);
                        }
                    }
                }

                // pass 3:  instantiate synsets (hooks up relations, set glosses, etc.)
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                            // instantiate synset defined on current line, using the instantiated synsets for all references...
                            // the key is rebuilt as POS:Offset, the ID format described on GetSynSet
                            _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }

                // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
                _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
                foreach (string indexPath in indexPaths)
                {
                    POS pos = GetFilePOS(indexPath);

                    // NOTE(review): EnsureContainsKey presumably adds a new instance of the given type when the key is absent - confirm
                    _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, Set<SynSet>>));

                    // scan word index file, skipping header lines
                    StreamReader indexFile = new StreamReader(indexPath);
                    string line;
                    while (indexFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                        {
                            // grab word and synset shells, along with the most common synset...the shells are
                            // throwaway objects whose IDs are used only to find the real synsets in _idSynset
                            string word = line.Substring(0, firstSpace);
                            SynSet mostCommonSynSet;
                            Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                            // set flag on most common synset if it's ambiguous
                            if (synsets.Count > 1)
                                _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);

                            // use reference to the synsets that we instantiated in our three-pass routine above
                            _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                            foreach (SynSet synset in synsets)
                                _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
            else
            {
                // open binary search streams for index files
                _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
                foreach (string indexPath in indexPaths)
                {
                    // create binary search stream for index file...the delegate compares the searched-for word to the
                    // word on the line the search landed on
                    BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
                        {
                            // if we landed on the header text, search further down
                            if (currentLine[0] == ' ')
                                return 1;

                            // get word on current line
                            string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                            // compare searched-for word to the current word
                            return ((string)searchWord).CompareTo(currentWord);
                        }));

                    // add search stream for current POS
                    _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
                }

                // open readers for synset data files...these are stored in a field and are not closed by this constructor
                _posSynSetDataFile = new Dictionary<POS, StreamReader>();
                foreach (string dataPath in dataPaths)
                    _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
            }
            #endregion
        }
        /// <summary>
        /// Computes the similarity of two synsets as 2 * depth(common) / (depth(synset1) + depth(synset2)), where
        /// "common" is the closest synset reachable from both synsets along the given relations.
        /// </summary>
        /// <param name="synset1">First synset</param>
        /// <param name="synset2">Second synset</param>
        /// <param name="strategy">Strategy to use. All strategies named WuPalmer1994* will produce the same result since only two synsets
        /// are available.</param>
        /// <param name="relations">Synset relations to follow when computing similarity (pass null for all relations)</param>
        /// <returns>Similarity, or 0 when the two synsets share no reachable synset</returns>
        /// <exception cref="NotImplementedException">The strategy name does not start with WuPalmer1994</exception>
        /// <exception cref="Exception">The computed similarity is below 0 or above 1</exception>
        public float GetSimilarity(SynSet synset1, SynSet synset2, Strategy strategy, params WordNetEngine.SynSetRelation[] relations)
        {
            // only the WuPalmer1994* family of strategies is implemented
            bool isWuPalmer = strategy.ToString().StartsWith("WuPalmer1994");
            if (!isWuPalmer)
                throw new NotImplementedException("Unrecognized strategy");

            // a null relation list means "follow every relation"
            if (relations == null)
                relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)).Cast<WordNetEngine.SynSetRelation>().ToArray();

            // find the closest synset that both inputs can reach
            SynSet commonSynset = synset1.GetClosestMutuallyReachableSynset(synset2, relations);

            float result;
            if (commonSynset == null)
                result = 0;
            else
            {
                // depth of the common synset, counted from 1
                int commonDepth = commonSynset.GetDepth(relations) + 1;

                // number of steps from each input to the common synset (path node count minus one)
                int stepsFrom1 = synset1.GetShortestPathTo(commonSynset, relations).Count - 1;
                int stepsFrom2 = synset2.GetShortestPathTo(commonSynset, relations).Count - 1;

                int depth1 = stepsFrom1 + commonDepth;
                int depth2 = stepsFrom2 + commonDepth;

                result = 2 * commonDepth / (float)(depth1 + depth2);
            }

            // sanity check on the range of the result
            if (result < 0 || result > 1)
                throw new Exception("Invalid similarity");

            return result;
        }
Exemple #6
0
        /// <summary>
        /// Instantiates the current synset from its data file definition line: reads the lexicographer file name, the
        /// words, the gloss and the relations. If idSynset is non-null, related synsets references are set to those from
        /// idSynset; otherwise, related synsets are created as shells.
        /// </summary>
        /// <param name="definition">Definition line of synset from data file</param>
        /// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
        /// <exception cref="Exception">The synset was already instantiated, the lexicographer file number is invalid,
        /// a word contains a space, or the gloss contains a pipe</exception>
        internal void Instantiate(string definition, Dictionary<string, SynSet> idSynset)
        {
            // don't re-instantiate
            if (_instantiated)
                throw new Exception("Synset has already been instantiated");

            /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
             * it starts with None, so we need to add 1 to the definition line's value to get the correct file name */
            // NOTE(review): GetField presumably returns the space-delimited field at the given zero-based index - confirm
            int lexicographerFileNumber = int.Parse(GetField(definition, 1)) + 1;
            if (lexicographerFileNumber <= 0)
                throw new Exception("Invalid lexicographer file name number. Should be >= 1.");

            _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexicographerFileNumber;

            // get number of words in the synset and the start character of the word list...the count is parsed as
            // hexadecimal. NOTE(review): the out argument is presumably the start position of the count field - confirm
            int wordStart;
            int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber);
            // move to the character following the next space, i.e., the start of the first word
            wordStart = definition.IndexOf(' ', wordStart) + 1;

            // get words in synset
            _words = new List<string>(numWords);
            for (int i = 0; i < numWords; ++i)
            {
                // the word ends one character before the next space
                int wordEnd = definition.IndexOf(' ', wordStart + 1) - 1;
                int wordLen = wordEnd - wordStart + 1;
                string word = definition.Substring(wordStart, wordLen);
                if (word.Contains(' '))
                    throw new Exception("Unexpected space in word:  " + word);

                _words.Add(word);

                // skip lex_id field...wordEnd + 2 is the first character after the space that follows the word
                wordStart = definition.IndexOf(' ', wordEnd + 2) + 1;
            }

            // get gloss (everything after the first pipe, trimmed)
            _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
            if (_gloss.Contains('|'))
                throw new Exception("Unexpected pipe in gloss");

            // get number and start of relations...the count field follows the three leading fields, the word count field
            // and two fields per word. unlike the word count, it is parsed as decimal.
            int relationCountField = 3 + (_words.Count * 2) + 1;
            int relationFieldStart;
            int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart));
            relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

            // grab each related synset
            _relationSynSets = new Dictionary<WordNetEngine.SynSetRelation, Set<SynSet>>();
            _lexicalRelations = new Dictionary<WordNetEngine.SynSetRelation, Dictionary<SynSet, Dictionary<int, Set<int>>>>();
            for (int relationNum = 0; relationNum < numRelations; ++relationNum)
            {
                // values for the current relation, filled in by the field loop below
                string relationSymbol = null;
                int relatedSynSetOffset = -1;
                WordNetEngine.POS relatedSynSetPOS = WordNetEngine.POS.None;
                int sourceWordIndex = -1;
                int targetWordIndex = -1;

                // each relation has four columns
                for (int relationField = 0; relationField <= 3; ++relationField)
                {
                    // extract the text between relationFieldStart and the next space
                    int fieldEnd = definition.IndexOf(' ', relationFieldStart + 1) - 1;
                    int fieldLen = fieldEnd - relationFieldStart + 1;
                    string fieldValue = definition.Substring(relationFieldStart, fieldLen);

                    // relation symbol
                    if (relationField == 0)
                        relationSymbol = fieldValue;
                    // related synset offset
                    else if (relationField == 1)
                        relatedSynSetOffset = int.Parse(fieldValue);
                    // related synset POS
                    else if (relationField == 2)
                        relatedSynSetPOS = GetPOS(fieldValue);
                    // source/target word for lexical relation...first two characters are the source index and the
                    // remainder is the target index, both hexadecimal
                    else if (relationField == 3)
                    {
                        sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber);
                        targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber);
                    }
                    // not reachable with the loop bounds above (0 through 3)
                    else
                        throw new Exception();

                    // advance to the start of the next field
                    relationFieldStart = definition.IndexOf(' ', relationFieldStart + 1) + 1;
                }

                // get related synset...create shell if we don't have a lookup
                SynSet relatedSynSet;
                if (idSynset == null)
                    relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
                // look up related synset directly (keys have the form POS:Offset)
                else
                    relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];

                // get relation
                WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

                // add semantic relation if we have neither a source nor a target word index (both indexes are zero)
                if (sourceWordIndex == 0 && targetWordIndex == 0)
                {
                    _relationSynSets.EnsureContainsKey(relation, typeof(Set<SynSet>));
                    _relationSynSets[relation].Add(relatedSynSet);
                }
                // add lexical relation, stored as relation -> related synset -> source word index -> target word indexes
                else
                {
                    _lexicalRelations.EnsureContainsKey(relation, typeof(Dictionary<SynSet, Dictionary<int, Set<int>>>));
                    _lexicalRelations[relation].EnsureContainsKey(relatedSynSet, typeof(Dictionary<int, Set<int>>));
                    _lexicalRelations[relation][relatedSynSet].EnsureContainsKey(sourceWordIndex, typeof(Set<int>));

                    // only record each target word index once
                    if (!_lexicalRelations[relation][relatedSynSet][sourceWordIndex].Contains(targetWordIndex))
                        _lexicalRelations[relation][relatedSynSet][sourceWordIndex].Add(targetWordIndex);
                }
            }

            // release the wordnet engine if we have one...don't need it anymore
            if (_wordNetEngine != null)
                _wordNetEngine = null;

            _instantiated = true;
        }
Exemple #7
0
        /// <summary>
        /// Finds the shortest path from the current synset to another one by a breadth-first search along the given
        /// synset relations. Search backpointers on the visited synsets are used during the search and are reset to
        /// null before returning.
        /// </summary>
        /// <param name="destination">Destination synset</param>
        /// <param name="relations">Relations to follow, or null for all relations.</param>
        /// <returns>Synsets on the path, ordered from the current synset to the destination, or null if none exists.</returns>
        public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<WordNetEngine.SynSetRelation> relations)
        {
            // default to following every relation
            if (relations == null)
                relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)) as WordNetEngine.SynSetRelation[];

            // path reconstruction stops at a null backpointer, so the start synset must not carry a stale one
            _searchBackPointer = null;

            // synsets reached so far (prevents revisiting when the relations form cycles)
            Set<SynSet> visited = new Set<SynSet>();
            visited.Add(this);

            // synsets waiting to be expanded, in the order they were reached
            Queue<SynSet> frontier = new Queue<SynSet>();
            frontier.Enqueue(this);

            List<SynSet> path = null;
            while (frontier.Count > 0)
            {
                SynSet current = frontier.Dequeue();

                if (current == destination)
                {
                    // follow the backpointers from the destination to the start, then flip the order
                    path = new List<SynSet>();
                    for (SynSet step = current; step != null; step = step.SearchBackPointer)
                        path.Add(step);

                    path.Reverse();
                    break;
                }

                // not there yet: queue every related synset that has not been reached before
                foreach (SynSet neighbor in current.GetRelatedSynSets(relations, false))
                {
                    if (visited.Contains(neighbor))
                        continue;

                    neighbor.SearchBackPointer = current;
                    frontier.Enqueue(neighbor);

                    visited.Add(neighbor);
                }
            }

            // clear the backpointers of every synset reached by this search
            foreach (SynSet reached in visited)
                reached.SearchBackPointer = null;

            return path;
        }
Exemple #8
0
        /// <summary>
        /// Finds the closest synset that can be reached from both the current synset and another synset along the given
        /// relations. For example, with the Hypernym relation this is the lowest synset that is a hypernym of both
        /// synsets; if the hypernym hierarchy forms a tree, it is their lowest common ancestor. Candidates are visited
        /// breadth-first starting from the current synset, and the first one the other synset has a path to is returned.
        /// </summary>
        /// <param name="synset">Other synset</param>
        /// <param name="relations">Relations to follow</param>
        /// <returns>Closest mutually reachable synset, or null if there is none</returns>
        public SynSet GetClosestMutuallyReachableSynset(SynSet synset, IEnumerable<WordNetEngine.SynSetRelation> relations)
        {
            // synsets reached so far (prevents revisiting when the relations form cycles)
            Set<SynSet> visited = new Set<SynSet>();
            visited.Add(this);

            // candidates waiting to be checked, in the order they were reached
            Queue<SynSet> frontier = new Queue<SynSet>();
            frontier.Enqueue(this);

            while (frontier.Count > 0)
            {
                SynSet candidate = frontier.Dequeue();

                // the first candidate that the other synset can also reach is the answer
                if (synset.GetShortestPathTo(candidate, relations) != null)
                    return candidate;

                // otherwise, widen the search to the candidate's related synsets
                foreach (SynSet neighbor in candidate.GetRelatedSynSets(relations, false))
                {
                    if (visited.Contains(neighbor))
                        continue;

                    frontier.Enqueue(neighbor);
                    visited.Add(neighbor);
                }
            }

            // every reachable synset was checked without success
            return null;
        }
Exemple #9
0
        /// <summary>
        /// Handles a double-click on the synset list: places the selected synset into the first free
        /// semantic-similarity slot, or asks the user to free a slot when both are taken. The compute control is
        /// enabled only while both slots are filled.
        /// </summary>
        /// <param name="sender">Event source (not used)</param>
        /// <param name="e">Mouse event data (not used)</param>
        private void synSets_MouseDoubleClick(object sender, MouseEventArgs e)
        {
            // nothing to do without a selection
            object selection = synSets.SelectedItem;
            if (selection == null)
                return;

            const string removalHint = " (double-click to remove)";
            SynSet chosen = (SynSet)selection;

            if (_semSimSs1 == null)
            {
                // first slot is free
                _semSimSs1 = chosen;
                ss1.Text = chosen.ToString() + removalHint;
            }
            else if (_semSimSs2 == null)
            {
                // second slot is free
                _semSimSs2 = chosen;
                ss2.Text = chosen.ToString() + removalHint;
            }
            else
            {
                // both slots are occupied
                MessageBox.Show("Please remove one of the synsets selected below (double-click it)");
            }

            bool bothChosen = _semSimSs1 != null && _semSimSs2 != null;
            computeSemSim.Enabled = bothChosen;
        }
Exemple #10
0
 /// <summary>
 /// Handles a double-click on ss2 by clearing the second semantic-similarity selection.
 /// </summary>
 /// <param name="sender">Event source (not used)</param>
 /// <param name="e">Event data (not used)</param>
 private void ss2_DoubleClick(object sender, EventArgs e)
 {
     // put back the text saved in _origSsLbl and forget the second synset
     ss2.Text = _origSsLbl;
     _semSimSs2 = null;
     // with the second selection cleared, the compute control is switched off
     computeSemSim.Enabled = false;
 }