Ejemplo n.º 1
0
        /// <summary>
        /// Looks up a single synset by its string identifier.
        /// </summary>
        /// <param name="synsetID">Synset identifier in the format returned by SynSet.ID (i.e., POS:Offset)</param>
        /// <returns>
        /// In in-memory mode, the synset stored under the given ID; otherwise a newly created synset for the
        /// parsed POS and offset, on which Instantiate() has been called.
        /// </returns>
        public SynSet GetSynSet(string synsetID)
        {
            // in-memory mode: fetch the synset straight from the ID lookup
            if (_inMemory)
            {
                return _idSynset[synsetID];
            }

            // on-disk mode: the text before the colon names the POS, the text after it is the offset
            int    separatorIndex = synsetID.IndexOf(':');
            string posName        = synsetID.Substring(0, separatorIndex);
            POS    parsedPos      = (POS)Enum.Parse(typeof(POS), posName);
            string offsetText     = synsetID.Substring(separatorIndex + 1);
            int    parsedOffset   = int.Parse(offsetText);

            // build a shell for that POS/offset and let it instantiate itself
            SynSet result = new SynSet(parsedPos, parsedOffset, this);
            result.Instantiate();

            return result;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds synset shells from one line of a word index file. A shell is a SynSet constructed with only a POS,
        /// an offset and an engine reference. The method is static so that it cannot accidentally pick up the current
        /// WordNetEngine; the engine to use is always the one supplied by the caller.
        /// </summary>
        /// <param name="wordIndexLine">Word index line from which to get synset shells</param>
        /// <param name="pos">POS of the given index line</param>
        /// <param name="wordNetEngine">WordNetEngine to pass to the constructor of each synset shell</param>
        /// <returns>Synset shells for the given index line, ordered from the last offset on the line to the first</returns>
        private static List<SynSet> GetSynSetShells(string wordIndexLine, POS pos, WordNetEngine wordNetEngine)
        {
            // tokenize the line on spaces, dropping empty tokens
            char[]   separators = new char[] { ' ' };
            string[] fields     = wordIndexLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // the third token is the synset count; the offsets themselves are the trailing tokens of the line
            int shellCount        = int.Parse(fields[2]);
            int lowestOffsetIndex = fields.Length - shellCount;

            List<SynSet> shells = new List<SynSet>();

            // walk the offsets backwards, from the end of the line down to the first offset
            int fieldIndex = fields.Length - 1;
            while (fieldIndex >= lowestOffsetIndex)
            {
                int shellOffset = int.Parse(fields[fieldIndex]);
                shells.Add(new SynSet(pos, shellOffset, wordNetEngine));
                --fieldIndex;
            }

            return shells;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Gets the shortest path from the current synset to another, following the given synset relations. The
        /// search expands one level of related synsets at a time from a FIFO queue, so the first path that reaches
        /// the destination is one with the fewest hops.
        /// </summary>
        /// <param name="destination">Destination synset</param>
        /// <param name="relations">Relations to follow, or null for all relations.</param>
        /// <returns>
        /// Synset path starting with the current synset and ending with the destination, or null if none exists.
        /// </returns>
        public List<SynSet> GetShortestPathTo(SynSet destination, IEnumerable<WordNetEngine.SynSetRelation> relations)
        {
            if (relations == null)
            {
                relations = Enum.GetValues(typeof(WordNetEngine.SynSetRelation)) as WordNetEngine.SynSetRelation[];
            }

            // make sure the backpointer on the current synset is null - can't predict what other functions might do
            _searchBackPointer = null;

            // avoid cycles. A hash set gives constant-time membership tests; the previous list made every check a
            // linear scan. This relies on SynSet having consistent Equals/GetHashCode, which its use as a
            // dictionary key elsewhere in this code base already assumes.
            HashSet<SynSet> synsetsEncountered = new HashSet<SynSet>();
            synsetsEncountered.Add(this);

            // start search queue
            Queue<SynSet> searchQueue = new Queue<SynSet>();
            searchQueue.Enqueue(this);

            List<SynSet> path = null;

            try
            {
                // run search
                while (searchQueue.Count > 0 && path == null)
                {
                    SynSet currSynSet = searchQueue.Dequeue();

                    // see if we've finished the search
                    if (currSynSet == destination)
                    {
                        // gather synsets along path by walking the backpointers to the start
                        path = new List<SynSet>();
                        while (currSynSet != null)
                        {
                            path.Add(currSynSet);
                            currSynSet = currSynSet.SearchBackPointer;
                        }

                        // reverse for the correct order
                        path.Reverse();
                    }
                    // expand the search one level
                    else
                    {
                        foreach (SynSet synset in currSynSet.GetRelatedSynSets(relations, false))
                        {
                            // Add returns false when an equal synset has already been seen
                            if (synsetsEncountered.Add(synset))
                            {
                                synset.SearchBackPointer = currSynSet;
                                searchQueue.Enqueue(synset);
                            }
                        }
                    }
                }
            }
            finally
            {
                // null-out all search backpointers, even if the search threw part-way through, so that
                // no stale pointers are left behind for a later search
                foreach (SynSet synset in synsetsEncountered)
                {
                    synset.SearchBackPointer = null;
                }
            }

            return path;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Instantiates the current synset by parsing its definition line: the word list, the gloss and the
        /// relations to other synsets. If idSynset is non-null, related synsets references are set to those from
        /// idSynset; otherwise, related synsets are created as shells. Parsing is skipped when the synset has
        /// already been instantiated. In every case the reference to the WordNet engine is cleared before returning.
        /// </summary>
        /// <param name="definition">Definition line of synset from data file</param>
        /// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
        internal void Instantiate(string definition, Dictionary <string, SynSet> idSynset)
        {
            // don't re-instantiate
            if (!_instantiated)
            {
                // get number of words in the synset and the start character of the word list
                // (the count field is parsed as a hexadecimal number; GetField presumably reports the start
                // position of the requested field through its out parameter - confirm against GetField)
                int wordStart;
                int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber);
                // advance to the character just after the next space, i.e. the start of the following field
                wordStart = definition.IndexOf(' ', wordStart) + 1;

                _words = new List <string>(numWords);

                // get words in synset
                for (int i = 0; i < numWords; ++i)
                {
                    // the word runs from wordStart up to the character before the next space
                    int    wordEnd = definition.IndexOf(' ', wordStart + 1) - 1;
                    int    wordLen = wordEnd - wordStart + 1;
                    string word    = definition.Substring(wordStart, wordLen);
                    _words.Add(word);

                    // get lex_id
                    // NOTE(review): this reads the two characters that follow the FIRST space of the whole line,
                    // independent of the loop position, so every iteration stores the same value. In the WordNet
                    // data-file layout that position appears to be the lex_filenum field rather than a per-word
                    // lex_id - confirm the intent.
                    lex_id = Convert.ToInt32(definition.Substring(definition.IndexOf(' ') + 1, 2));

                    // skip lex_id field
                    // (wordEnd + 1 is the space after the word, so searching from wordEnd + 2 finds the space that
                    // ends the field after the word; the next word starts one character later)
                    wordStart = definition.IndexOf(' ', wordEnd + 2) + 1;
                }

                // get gloss: everything after the first '|' character, trimmed
                _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();

                // get number and start of relations
                // (each word occupies two fields after field 3, so the relation count is the field that follows them)
                int relationCountField = 3 + (_words.Count * 2) + 1;
                int relationFieldStart;
                int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart));
                // advance to the start of the first relation field
                relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

                // grab each related synset
                _relationSynSets  = new Dictionary <WordNetEngine.SynSetRelation, List <SynSet> >();
                _lexicalRelations = new Dictionary <WordNetEngine.SynSetRelation, Dictionary <SynSet, Dictionary <int, List <int> > > >();
                for (int relationNum = 0; relationNum < numRelations; ++relationNum)
                {
                    // placeholder values; each is overwritten by the matching field below
                    string            relationSymbol      = null;
                    int               relatedSynSetOffset = -1;
                    WordNetEngine.POS relatedSynSetPOS    = WordNetEngine.POS.None;
                    int               sourceWordIndex     = -1;
                    int               targetWordIndex     = -1;

                    // each relation has four columns
                    for (int relationField = 0; relationField <= 3; ++relationField)
                    {
                        // the field runs from relationFieldStart up to the character before the next space
                        int    fieldEnd   = definition.IndexOf(' ', relationFieldStart + 1) - 1;
                        int    fieldLen   = fieldEnd - relationFieldStart + 1;
                        string fieldValue = definition.Substring(relationFieldStart, fieldLen);

                        // relation symbol
                        if (relationField == 0)
                        {
                            relationSymbol = fieldValue;
                        }
                        // related synset offset
                        else if (relationField == 1)
                        {
                            relatedSynSetOffset = int.Parse(fieldValue);
                        }
                        // related synset POS
                        else if (relationField == 2)
                        {
                            relatedSynSetPOS = GetPOS(fieldValue);
                        }
                        // source/target word for lexical relation
                        // (first two characters are the source index, the rest is the target index, both hexadecimal)
                        else if (relationField == 3)
                        {
                            sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber);
                            targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber);
                        }
                        // not reachable with the loop bounds above (relationField is always 0..3)
                        else
                        {
                            throw new Exception();
                        }

                        // move to the start of the next field
                        relationFieldStart = definition.IndexOf(' ', relationFieldStart + 1) + 1;
                    }

                    // get related synset...create shell if we don't have a lookup
                    SynSet relatedSynSet;
                    if (idSynset == null)
                    {
                        relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
                    }
                    // look up related synset directly
                    // (the key is built as POS + ":" + offset; a missing key makes the dictionary indexer throw)
                    else
                    {
                        relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];
                    }

                    // get relation
                    WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

                    // add semantic relation if we have neither a source nor a target word index
                    // (i.e. both indices were parsed as zero)
                    if (sourceWordIndex == 0 && targetWordIndex == 0)
                    {
                        _relationSynSets.EnsureContainsKey(relation, typeof(List <SynSet>));
                        _relationSynSets[relation].Add(relatedSynSet);
                    }
                    // add lexical relation
                    // (stored as relation -> related synset -> source word index -> list of target word indices)
                    else
                    {
                        _lexicalRelations.EnsureContainsKey(relation, typeof(Dictionary <SynSet, Dictionary <int, List <int> > >));
                        _lexicalRelations[relation].EnsureContainsKey(relatedSynSet, typeof(Dictionary <int, List <int> >));
                        _lexicalRelations[relation][relatedSynSet].EnsureContainsKey(sourceWordIndex, typeof(List <int>));

                        // record each target word index only once per source word index
                        if (!_lexicalRelations[relation][relatedSynSet][sourceWordIndex].Contains(targetWordIndex))
                        {
                            _lexicalRelations[relation][relatedSynSet][sourceWordIndex].Add(targetWordIndex);
                        }
                    }
                }
                _instantiated = true;
            }

            // release the wordnet engine if we have one...don't need it anymore
            // (this runs whether or not parsing happened above)
            if (_wordNetEngine != null)
            {
                _wordNetEngine = null;
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Constructor. Verifies that the WordNet directory and its data/index files exist, opens the lemmatizer
        /// exception files, and then either loads every synset into memory or opens the streams needed for
        /// on-disk lookups.
        /// </summary>
        /// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
        /// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
        /// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
        /// binary search algorithm.</param>
        /// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist.</exception>
        /// <exception cref="FileNotFoundException">One of the data.* or index.* files is missing; FileName holds its path.</exception>
        public WordNetEngine(string wordNetDirectory, bool inMemory)
        {
            _wordNetDirectory         = wordNetDirectory;
            _inMemory                 = inMemory;
            _posIndexWordSearchStream = null;
            _posSynSetDataFile        = null;

            if (!System.IO.Directory.Exists(_wordNetDirectory))
            {
                throw new DirectoryNotFoundException("Error 502");
            }

            // get data and index paths
            string[] dataPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "data.adj"),
                Path.Combine(_wordNetDirectory, "data.adv"),
                Path.Combine(_wordNetDirectory, "data.noun"),
                Path.Combine(_wordNetDirectory, "data.verb")
            };

            string[] indexPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "index.adj"),
                Path.Combine(_wordNetDirectory, "index.adv"),
                Path.Combine(_wordNetDirectory, "index.noun"),
                Path.Combine(_wordNetDirectory, "index.verb")
            };

            // make sure all files exist
            foreach (string path in dataPaths.Union(indexPaths))
            {
                if (!System.IO.File.Exists(path))
                {
                    // same message as before; the missing path is additionally exposed through FileName
                    throw new FileNotFoundException("Error 502", path);
                }
            }

            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
            // *                                                               *
            // *   UPDATE [HASSAN:11/03/2017]: The lemmatizer requires the     *
            // *   exception dictionary for each POS to be loaded as a stream  *
            // *                                                               *
            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

            // These readers are stored in LemmaExcptionsFile and intentionally left open here.
            // Path.Combine is used (as for the data/index files above) instead of a hard-coded "\\" separator.
            LemmaExcptionsFile = new Dictionary<string, StreamReader>(4);
            LemmaExcptionsFile.Add("noun", new StreamReader(Path.Combine(_wordNetDirectory, "noun.exc")));
            LemmaExcptionsFile.Add("verb", new StreamReader(Path.Combine(_wordNetDirectory, "verb.exc")));

            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
            // *                                                               *
            // *   UPDATE [HASSAN:11/07/2017]: The lemmatizer requires the     *
            // *   exception dictionary for noun only in the context of the    *
            // *   SemCluster tool. In order to implement the lemmatizer for   *
            // *   all 4 POS tags you will need the following:                 *
            // *    1) Uncomment the following lines.                          *
            // *    2) Uncomment the lines in suffixMap variable.              *
            // *    3) Uncomment the GetSynsets Switch section                 *
            // *    4) Add Exception files for each POS in the data folder     *
            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

            //LemmaExcptionsFile.Add("adjective", new StreamReader(Path.Combine(_wordNetDirectory, "adj.exc")));
            //LemmaExcptionsFile.Add("adverb", new StreamReader(Path.Combine(_wordNetDirectory, "adv.exc")));

            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
            // *                                                               *
            // *   UPDATE [HASSAN:28/01/2016]: The #region index file sorting  *
            // *   has been removed here, since it is required to run only     *
            // *   for the first program execution                             *
            // *                                                               *
            // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

            #region engine init
            if (inMemory)
            {
                // pass 1:  get total number of synsets (used only to pre-size the ID lookup)
                int totalSynsets = 0;
                foreach (string dataPath in dataPaths)
                {
                    // scan synset data file for lines that don't start with a space...these are synset definition lines
                    // (each reader is disposed as soon as its scan finishes so the file handle is not leaked)
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (dataFile.TryReadLine(out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                ++totalSynsets;
                            }
                        }
                    }
                }

                // pass 2:  create synset shells (pos and offset only)
                _idSynset = new Dictionary<string, SynSet>(totalSynsets);
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (dataFile.TryReadLine(out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // get offset and create synset shell
                                int    offset = int.Parse(line.Substring(0, firstSpace));
                                SynSet synset = new SynSet(pos, offset, null);

                                _idSynset.Add(synset.ID, synset);
                            }
                        }
                    }
                }

                // pass 3:  instantiate synsets (hooks up relations, set glosses, etc.)
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (dataFile.TryReadLine(out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // instantiate synset defined on current line, using the instantiated synsets for all references
                                _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                            }
                        }
                    }
                }

                // organize synsets by pos and words
                _posWordSynSets = new Dictionary<POS, Dictionary<string, List<SynSet>>>();
                foreach (string indexPath in indexPaths)
                {
                    POS pos = GetFilePOS(indexPath);

                    _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, List<SynSet>>));
                    Dictionary<string, List<SynSet>> wordSynSets = _posWordSynSets[pos];

                    // scan word index file, skipping header lines
                    using (StreamReader indexFile = new StreamReader(indexPath))
                    {
                        string line;
                        while (indexFile.TryReadLine(out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // grab word and synset shells
                                string       word   = line.Substring(0, firstSpace);
                                List<SynSet> shells = GetSynSetShells(line, pos, null);

                                // use reference to the synsets that we instantiated in our three-pass routine above
                                List<SynSet> synsetsForWord = new List<SynSet>(shells.Count);
                                wordSynSets.Add(word, synsetsForWord);
                                foreach (SynSet shell in shells)
                                {
                                    synsetsForWord.Add(_idSynset[shell.ID]);
                                }
                            }
                        }
                    }
                }
            }
            else
            {
                // open binary search streams for index files; these stay open for later on-disk lookups
                _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
                foreach (string indexPath in indexPaths)
                {
                    // create binary search stream for index file
                    BinarySearchTextStream searchStream = new BinarySearchTextStream(
                        indexPath,
                        new BinarySearchTextStream.SearchComparisonDelegate(
                            delegate(string searchWord, string currentLine)
                            {
                                // if we landed on the header text, search further down
                                if (currentLine[0] == ' ')
                                {
                                    return(1);
                                }

                                // get word on current line
                                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                                // compare searched-for word to the current word
                                return(((string)searchWord).CompareTo(currentWord));
                            }));

                    // add search stream for current POS
                    _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
                }

                // open readers for synset data files; these stay open for later on-disk lookups
                _posSynSetDataFile = new Dictionary<POS, StreamReader>();
                foreach (string dataPath in dataPaths)
                {
                    _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
                }
            }
            #endregion
        }