/// <summary>
 /// Constructor
 /// </summary>
 /// <param name="wordNetEngine">WordNet engine to use</param>
 public WordNetSimilarityModel(WordNetEngine wordNetEngine)
 {
     _wordNetEngine = wordNetEngine;
 }
Beispiel #2
0
        /// <summary>
        /// Instantiates the current synset. If idSynset is non-null, related synsets references are set to those from
        /// idSynset; otherwise, related synsets are created as shells.
        /// </summary>
        /// <param name="definition">Definition line of synset from data file</param>
        /// <param name="idSynset">Lookup for related synsets. If null, all related synsets will be created as shells.</param>
        internal void Instantiate(string definition, Dictionary <string, SynSet> idSynset)
        {
            // don't re-instantiate
            if (_instantiated)
            {
                throw new Exception("Synset has already been instantiated");
            }

            /* get lexicographer file name...the enumeration lines up precisely with the wordnet spec (see the lexnames file) except that
             * it starts with None, so we need to add 1 to the definition line's value to get the correct file name */
            int lexicographerFileNumber = int.Parse(GetField(definition, 1)) + 1;

            if (lexicographerFileNumber <= 0)
            {
                throw new Exception("Invalid lexicographer file name number. Should be >= 1.");
            }

            _lexicographerFileName = (WordNetEngine.LexicographerFileName)lexicographerFileNumber;

            // get number of words in the synset and the start character of the word list
            int wordStart;
            int numWords = int.Parse(GetField(definition, 3, out wordStart), NumberStyles.HexNumber);

            wordStart = definition.IndexOf(' ', wordStart) + 1;

            // get words in synset
            _words = new List <string>(numWords);
            for (int i = 0; i < numWords; ++i)
            {
                int    wordEnd = definition.IndexOf(' ', wordStart + 1) - 1;
                int    wordLen = wordEnd - wordStart + 1;
                string word    = definition.Substring(wordStart, wordLen);
                if (Enumerable.Contains(word, ' '))                 //ZK changed to static method
                {
                    throw new Exception("Unexpected space in word:  " + word);
                }

                _words.Add(word);

                // skip lex_id field
                wordStart = definition.IndexOf(' ', wordEnd + 2) + 1;
            }

            // get gloss
            _gloss = definition.Substring(definition.IndexOf('|') + 1).Trim();
            if (Enumerable.Contains(_gloss, '|')) //ZK changed to static method
            {
                throw new Exception("Unexpected pipe in gloss");
            }

            // get number and start of relations
            int relationCountField = 3 + (_words.Count * 2) + 1;
            int relationFieldStart;
            int numRelations = int.Parse(GetField(definition, relationCountField, out relationFieldStart));

            relationFieldStart = definition.IndexOf(' ', relationFieldStart) + 1;

            // grab each related synset
            _relationSynSets  = new Dictionary <WordNetEngine.SynSetRelation, Set <SynSet> >();
            _lexicalRelations = new Dictionary <WordNetEngine.SynSetRelation, Dictionary <SynSet, Dictionary <int, Set <int> > > >();
            for (int relationNum = 0; relationNum < numRelations; ++relationNum)
            {
                string            relationSymbol      = null;
                int               relatedSynSetOffset = -1;
                WordNetEngine.POS relatedSynSetPOS    = WordNetEngine.POS.None;
                int               sourceWordIndex     = -1;
                int               targetWordIndex     = -1;

                // each relation has four columns
                for (int relationField = 0; relationField <= 3; ++relationField)
                {
                    int    fieldEnd   = definition.IndexOf(' ', relationFieldStart + 1) - 1;
                    int    fieldLen   = fieldEnd - relationFieldStart + 1;
                    string fieldValue = definition.Substring(relationFieldStart, fieldLen);

                    // relation symbol
                    if (relationField == 0)
                    {
                        relationSymbol = fieldValue;
                    }
                    // related synset offset
                    else if (relationField == 1)
                    {
                        relatedSynSetOffset = int.Parse(fieldValue);
                    }
                    // related synset POS
                    else if (relationField == 2)
                    {
                        relatedSynSetPOS = GetPOS(fieldValue);
                    }
                    // source/target word for lexical relation
                    else if (relationField == 3)
                    {
                        sourceWordIndex = int.Parse(fieldValue.Substring(0, 2), NumberStyles.HexNumber);
                        targetWordIndex = int.Parse(fieldValue.Substring(2), NumberStyles.HexNumber);
                    }
                    else
                    {
                        throw new Exception();
                    }

                    relationFieldStart = definition.IndexOf(' ', relationFieldStart + 1) + 1;
                }

                // get related synset...create shell if we don't have a lookup
                SynSet relatedSynSet;
                if (idSynset == null)
                {
                    relatedSynSet = new SynSet(relatedSynSetPOS, relatedSynSetOffset, _wordNetEngine);
                }
                // look up related synset directly
                else
                {
                    relatedSynSet = idSynset[relatedSynSetPOS + ":" + relatedSynSetOffset];
                }

                // get relation
                WordNetEngine.SynSetRelation relation = WordNetEngine.GetSynSetRelation(_pos, relationSymbol);

                // add semantic relation if we have neither a source nor a target word index
                if (sourceWordIndex == 0 && targetWordIndex == 0)
                {
                    DictionaryExtensions.EnsureContainsKey(_relationSynSets, relation, typeof(Set <SynSet>));                    //ZK changed to static method
                    _relationSynSets[relation].Add(relatedSynSet);
                }
                // add lexical relation
                else
                {
                    DictionaryExtensions.EnsureContainsKey(_lexicalRelations, relation, typeof(Dictionary <SynSet, Dictionary <int, Set <int> > >));           //ZK changed to static method
                    DictionaryExtensions.EnsureContainsKey(_lexicalRelations[relation], relatedSynSet, typeof(Dictionary <int, Set <int> >));                  //ZK changed to static method
                    DictionaryExtensions.EnsureContainsKey(_lexicalRelations[relation][relatedSynSet], sourceWordIndex, typeof(Set <int>));                    //ZK changed to static method

                    if (!_lexicalRelations[relation][relatedSynSet][sourceWordIndex].Contains(targetWordIndex))
                    {
                        _lexicalRelations[relation][relatedSynSet][sourceWordIndex].Add(targetWordIndex);
                    }
                }
            }

            // release the wordnet engine if we have one...don't need it anymore
            if (_wordNetEngine != null)
            {
                _wordNetEngine = null;
            }

            _instantiated = true;
        }
        /// <summary>
        /// Gets synset shells from a word index line. A synset shell is an instance of SynSet with only the POS and Offset
        /// members initialized. These members are enough to look up the full synset within the corresponding data file. This
        /// method is static to prevent inadvertent references to a current WordNetEngine, which should be passed via the
        /// corresponding parameter.
        /// </summary>
        /// <param name="wordIndexLine">Word index line from which to get synset shells</param>
        /// <param name="pos">POS of the given index line</param>
        /// <param name="mostCommonSynSet">Returns the most common synset for the word</param>
        /// <param name="wordNetEngine">WordNetEngine to pass to the constructor of each synset shell</param>
        /// <returns>Synset shells for the given index line</returns>
        private static Set <SynSet> GetSynSetShells(string wordIndexLine, POS pos, out SynSet mostCommonSynSet, WordNetEngine wordNetEngine)
        {
            Set <SynSet> synsets = new Set <SynSet>();

            mostCommonSynSet = null;

            // get number of synsets
            string[] parts      = wordIndexLine.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int      numSynSets = int.Parse(parts[2]);

            // grab each synset shell, from last to first
            int firstOffsetIndex = parts.Length - numSynSets;

            for (int i = parts.Length - 1; i >= firstOffsetIndex; --i)
            {
                // create synset
                int offset = int.Parse(parts[i]);

                // add synset to collection
                SynSet synset = new SynSet(pos, offset, wordNetEngine);
                synsets.Add(synset);

                // if this is the last synset offset to get (since we grabbed them in reverse order), record it as the most common synset
                if (i == firstOffsetIndex)
                {
                    mostCommonSynSet = synset;
                }
            }

            if (mostCommonSynSet == null)
            {
                throw new Exception("Failed to get most common synset");
            }

            return(synsets);
        }