/// <summary>
/// Looks up the exception-file line for a lemma and returns the words listed on it.
/// </summary>
/// <param name="lemma">Key handed to <c>BinarySearch</c> to locate the line.</param>
/// <param name="partOfSpeech">Key into <c>mDataFileDictionary</c> selecting which exception file is searched.</param>
/// <returns>
/// Every token on the matching line after the first one, in file order;
/// the <c>mEmpty</c> field when the search finds no line.
/// </returns>
protected internal override string[] GetExceptionForms(string lemma, string partOfSpeech)
{
    string entry = BinarySearch(lemma, mDataFileDictionary[partOfSpeech].ExceptionFile);
    if (entry == null)
    {
        return mEmpty;
    }

    Tokenizer tokens = new Tokenizer(entry);

    // The first token on the line is not part of the result; consume it and move on.
    tokens.NextToken();

    // NextToken signals the end of the line by returning null.
    List<string> forms = new List<string>();
    for (string form = tokens.NextToken(); form != null; form = tokens.NextToken())
    {
        forms.Add(form);
    }

    return forms.ToArray();
}
/// <summary>
/// Parses a single tokenized index line into an <see cref="IndexWord"/>.
/// </summary>
/// <param name="partOfSpeech">Part of speech passed through unchanged to the resulting <see cref="IndexWord"/>.</param>
/// <param name="line">The raw index line to tokenize.</param>
/// <returns>
/// An <see cref="IndexWord"/> built from the word (underscores replaced by spaces), the relation
/// type tokens, the synset offsets and the tag sense count. The relation type array and the
/// offset array are <c>null</c> when their respective counts are not positive.
/// </returns>
private IndexWord CreateIndexWord(string partOfSpeech, string line)
{
    Tokenizer tokens = new Tokenizer(line);

    string lemma = tokens.NextToken().Replace('_', ' ');

    // The second field is treated as a redundant part-of-speech marker: consumed, never used.
    tokens.NextToken();

    int offsetCount = int.Parse(tokens.NextToken());
    int relationTypeCount = int.Parse(tokens.NextToken());

    // A non-positive count leaves the array null and the loop below never runs.
    string[] relationTypes = relationTypeCount > 0 ? new string[relationTypeCount] : null;
    for (int i = 0; i < relationTypeCount; i++)
    {
        relationTypes[i] = tokens.NextToken();
    }

    // A second sense count follows; it is still parsed (so a malformed field throws)
    // but its value is not needed.
    int.Parse(tokens.NextToken());

    int tagSenseCount = int.Parse(tokens.NextToken());

    int[] synsetOffsets = offsetCount > 0 ? new int[offsetCount] : null;
    for (int i = 0; i < offsetCount; i++)
    {
        synsetOffsets[i] = int.Parse(tokens.NextToken());
    }

    return new IndexWord(lemma, partOfSpeech, relationTypes, synsetOffsets, tagSenseCount);
}
/// <summary>
/// Reads the data-file record located at <paramref name="synsetOffset"/> and builds a
/// <see cref="Synset"/> from its fields.
/// </summary>
/// <param name="partOfSpeech">Key into <c>mDataFileDictionary</c> selecting the data file to read.</param>
/// <param name="synsetOffset">
/// Position, measured from the beginning of the underlying stream, at which the record starts.
/// </param>
/// <returns>
/// A <see cref="Synset"/> carrying the requested offset, the gloss (everything after the first
/// <c>'|'</c> in the record), the words with underscores replaced by spaces, the lexicographer
/// file name and the parsed relations.
/// </returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// No line could be read at <paramref name="synsetOffset"/> (the reader was already at the end of the stream).
/// </exception>
protected internal override Synset CreateSynset(string partOfSpeech, int synsetOffset)
{
    const System.Globalization.NumberStyles hex = System.Globalization.NumberStyles.HexNumber;

    StreamReader dataFile = mDataFileDictionary[partOfSpeech].DataFile;

    // The reader is reused across calls, so any text it has buffered must be dropped
    // before the underlying stream is repositioned; otherwise ReadLine would return stale data.
    dataFile.DiscardBufferedData();
    dataFile.BaseStream.Seek(synsetOffset, SeekOrigin.Begin);

    string record = dataFile.ReadLine();
    if (record == null)
    {
        // ReadLine returns null only when nothing is left to read, i.e. the offset
        // points at or beyond the end of the data file.
        throw new System.ArgumentOutOfRangeException(
            "synsetOffset",
            synsetOffset,
            "No synset record could be read at this offset in the '" + partOfSpeech + "' data file.");
    }

    Tokenizer tokenizer = new Tokenizer(record);

    // Leading offset field: parsed so a malformed record still fails here, but the
    // synset is constructed from the offset that was requested.
    int.Parse(tokenizer.NextToken());

    string lexicographerFile = mLexicographerFiles[int.Parse(tokenizer.NextToken())];

    // Synset type token: consumed, not used.
    // NOTE(review): an earlier implementation (previously kept here as commented-out code)
    // derived an adjective sub-type from antonym / pertainym pointers; nothing equivalent is done now.
    tokenizer.NextToken();

    // Word count is a hexadecimal field; each word is followed by a hexadecimal id.
    int wordCount = int.Parse(tokenizer.NextToken(), hex);
    string[] words = new string[wordCount];
    for (int currentWord = 0; currentWord < wordCount; currentWord++)
    {
        words[currentWord] = tokenizer.NextToken().Replace("_", " ");

        // The per-word id is validated by parsing and then discarded.
        int.Parse(tokenizer.NextToken(), hex);
    }

    int relationCount = int.Parse(tokenizer.NextToken());
    Relation[] relations = new Relation[relationCount];
    for (int currentRelation = 0; currentRelation < relationCount; currentRelation++)
    {
        string relationTypeKey = tokenizer.NextToken();
        int targetSynsetOffset = int.Parse(tokenizer.NextToken());

        // Expand the one-letter part-of-speech code; any other value is passed through unchanged.
        string targetPartOfSpeech = tokenizer.NextToken();
        switch (targetPartOfSpeech)
        {
            case "n":
                targetPartOfSpeech = "noun";
                break;
            case "v":
                targetPartOfSpeech = "verb";
                break;
            case "a":
            case "s":
                targetPartOfSpeech = "adjective";
                break;
            case "r":
                targetPartOfSpeech = "adverb";
                break;
        }

        int sourceTarget = int.Parse(tokenizer.NextToken(), hex);
        RelationType relationType = (RelationType)mRelationTypeDictionary[relationTypeKey];

        if (sourceTarget == 0)
        {
            // Zero means the relation is not tied to specific words.
            relations[currentRelation] = new Relation(this, relationType, targetSynsetOffset, targetPartOfSpeech);
        }
        else
        {
            // High byte: source word number; low byte: target word number.
            int sourceWord = sourceTarget >> 8;
            int targetWord = sourceTarget & 0xff;
            relations[currentRelation] = new Relation(this, relationType, targetSynsetOffset, targetPartOfSpeech, sourceWord, targetWord);
        }
    }

    // Either the gloss separator follows directly, or a frame count and that many
    // frame entries precede it. Frame entries are validated by parsing but not retained.
    string frameData = tokenizer.NextToken();
    if (frameData != "|")
    {
        int frameCount = int.Parse(frameData);
        for (int currentFrame = 0; currentFrame < frameCount; currentFrame++)
        {
            tokenizer.NextToken();                  // "+" marker
            int.Parse(tokenizer.NextToken());       // frame number
            int.Parse(tokenizer.NextToken(), hex);  // word id
        }

        // Consume the token that follows the frames (expected to be the "|" separator).
        tokenizer.NextToken();
    }

    string gloss = record.Substring(record.IndexOf('|') + 1);

    return new Synset(synsetOffset, gloss, words, lexicographerFile, relations);
}