コード例 #1
0
 /// <summary>
 /// Looks up <paramref name="lemma"/> in the exception file registered for
 /// <paramref name="partOfSpeech"/> and returns the forms listed on the matching line.
 /// </summary>
 /// <param name="lemma">Key handed to <c>BinarySearch</c>.</param>
 /// <param name="partOfSpeech">Key into <c>mDataFileDictionary</c> selecting which exception file to search.</param>
 /// <returns>
 /// Every token of the matching line except the first one, in order;
 /// <c>mEmpty</c> when <c>BinarySearch</c> yields no line.
 /// </returns>
 protected internal override string[] GetExceptionForms(string lemma, string partOfSpeech)
 {
     string entry = BinarySearch(lemma, mDataFileDictionary[partOfSpeech].ExceptionFile);
     if (entry == null)
     {
         // Nothing found for this lemma.
         return mEmpty;
     }

     Tokenizer entryTokens = new Tokenizer(entry);

     // The first token is consumed and ignored (presumably the key that was searched for).
     entryTokens.NextToken();

     // Collect the remaining tokens until the tokenizer reports the end with null.
     List<string> forms = new List<string>();
     for (string form = entryTokens.NextToken(); form != null; form = entryTokens.NextToken())
     {
         forms.Add(form);
     }
     return forms.ToArray();
 }
コード例 #2
0
        /// <summary>
        /// Parses one index line into an <see cref="IndexWord"/>.
        /// </summary>
        /// <remarks>
        /// Tokens are consumed strictly in this order: word, part-of-speech marker (ignored),
        /// sense count, relation-type count, that many relation types, a second sense count
        /// (parsed but ignored), tagged sense count, then one synset offset per sense.
        /// </remarks>
        /// <param name="partOfSpeech">Part of speech passed through to the resulting <see cref="IndexWord"/>.</param>
        /// <param name="line">The raw line to tokenize.</param>
        /// <returns>
        /// The parsed index word. Its relation-type and synset-offset arrays are <c>null</c>
        /// when the corresponding count is not positive.
        /// </returns>
        private IndexWord CreateIndexWord(string partOfSpeech, string line)
        {
            Tokenizer fields = new Tokenizer(line);

            // Underscores in the stored word are turned into spaces.
            string lemma = fields.NextToken().Replace('_', ' ');

            // The line carries its own part-of-speech token; the caller's value is used instead.
            fields.NextToken();

            int offsetCount = int.Parse(fields.NextToken());
            int relationCount = int.Parse(fields.NextToken());

            string[] relationSymbols = null;
            if (relationCount > 0)
            {
                relationSymbols = new string[relationCount];
                int filled = 0;
                while (filled < relationCount)
                {
                    relationSymbols[filled] = fields.NextToken();
                    filled++;
                }
            }

            // The sense count is repeated here; still parsed so a malformed value fails the same way.
            int.Parse(fields.NextToken());
            int taggedSenses = int.Parse(fields.NextToken());

            int[] offsets = null;
            if (offsetCount > 0)
            {
                offsets = new int[offsetCount];
                int read = 0;
                while (read < offsetCount)
                {
                    offsets[read] = int.Parse(fields.NextToken());
                    read++;
                }
            }

            return new IndexWord(lemma, partOfSpeech, relationSymbols, offsets, taggedSenses);
        }
コード例 #3
0
        /// <summary>
        /// Reads the record that starts at byte position <paramref name="synsetOffset"/> in the
        /// data file for <paramref name="partOfSpeech"/> and parses it into a <see cref="Synset"/>.
        /// </summary>
        /// <remarks>
        /// Tokens are consumed in this order: offset (parsed, unused), lexicographer file index,
        /// synset type (unused), hexadecimal word count, then for each word the word text and a
        /// hexadecimal id (unused); decimal relation count, then for each relation its type key,
        /// target offset, target part-of-speech letter and a hexadecimal source/target word pair.
        /// If the next token is not "|", it is a frame count followed by that many three-token
        /// frame entries, which are parsed and discarded. The gloss is everything after the
        /// first '|' in the raw record.
        /// </remarks>
        /// <param name="partOfSpeech">Key into <c>mDataFileDictionary</c> selecting the data file.</param>
        /// <param name="synsetOffset">Position to seek to in the data file; also stored on the returned synset.</param>
        /// <returns>The parsed synset.</returns>
        protected internal override Synset CreateSynset(string partOfSpeech, int synsetOffset)
        {
            const System.Globalization.NumberStyles hexStyle = System.Globalization.NumberStyles.HexNumber;

            // Drop whatever the reader has buffered before repositioning the underlying stream,
            // so that ReadLine starts at the requested position.
            StreamReader reader = mDataFileDictionary[partOfSpeech].DataFile;
            reader.DiscardBufferedData();
            reader.BaseStream.Seek(synsetOffset, SeekOrigin.Begin);
            string rawRecord = reader.ReadLine();

            Tokenizer fields = new Tokenizer(rawRecord);

            // Leading offset field: parsed (so malformed input still fails here) but not used.
            int.Parse(fields.NextToken());

            int lexFileIndex = int.Parse(fields.NextToken());
            string lexFileName = mLexicographerFiles[lexFileIndex];

            // Synset type token is consumed and ignored.
            fields.NextToken();

            int memberCount = int.Parse(fields.NextToken(), hexStyle);
            string[] members = new string[memberCount];
            for (int i = 0; i < memberCount; i++)
            {
                members[i] = fields.NextToken().Replace("_", " ");

                // Each word is followed by a hexadecimal id that is parsed and discarded.
                int.Parse(fields.NextToken(), hexStyle);
            }

            int linkCount = int.Parse(fields.NextToken());
            Relation[] links = new Relation[linkCount];
            for (int i = 0; i < linkCount; i++)
            {
                string typeKey = fields.NextToken();
                int linkedOffset = int.Parse(fields.NextToken());

                // Expand the one-letter part-of-speech code; any other value is kept as is.
                string linkedPartOfSpeech = fields.NextToken();
                if (linkedPartOfSpeech == "n")
                {
                    linkedPartOfSpeech = "noun";
                }
                else if (linkedPartOfSpeech == "v")
                {
                    linkedPartOfSpeech = "verb";
                }
                else if (linkedPartOfSpeech == "a" || linkedPartOfSpeech == "s")
                {
                    linkedPartOfSpeech = "adjective";
                }
                else if (linkedPartOfSpeech == "r")
                {
                    linkedPartOfSpeech = "adverb";
                }

                // High byte = source word number, low byte = target word number;
                // zero means the relation has no word-level source/target.
                int wordPair = int.Parse(fields.NextToken(), hexStyle);
                if (wordPair != 0)
                {
                    int fromWord = wordPair >> 8;
                    int toWord = wordPair & 0xff;
                    links[i] = new Relation(this, (RelationType)mRelationTypeDictionary[typeKey], linkedOffset, linkedPartOfSpeech, fromWord, toWord);
                }
                else
                {
                    links[i] = new Relation(this, (RelationType)mRelationTypeDictionary[typeKey], linkedOffset, linkedPartOfSpeech);
                }
            }

            // Either the "|" that introduces the gloss, or a frame count preceding it.
            string marker = fields.NextToken();
            if (marker != "|")
            {
                int frames = int.Parse(marker);
                for (int i = 0; i < frames; i++)
                {
                    fields.NextToken();                      // "+" separator
                    int.Parse(fields.NextToken());           // frame number (discarded)
                    int.Parse(fields.NextToken(), hexStyle); // word id (discarded)
                }

                // Consume the token following the frame list.
                fields.NextToken();
            }

            // IndexOf returns -1 when no '|' exists, in which case the whole record becomes the gloss.
            int barPosition = rawRecord.IndexOf('|');
            string gloss = rawRecord.Substring(barPosition + 1);

            return new Synset(synsetOffset, gloss, members, lexFileName, links);
        }