/// <summary>
/// Builds the Weka instance that describes a single "why" candidate.
/// </summary>
/// <remarks>
/// Attribute layout, as indexed into <paramref name="fvWhy"/>:
/// 0-6 candidate features, then <c>whyWordsBefore</c> preceding words,
/// <c>whyWordsAfter</c> following words, <c>whyWordsBefore</c> preceding
/// part-of-speech tags and <c>whyWordsAfter</c> following part-of-speech tags.
/// The instance is allocated with one extra slot that this method never
/// fills (presumably the class attribute; confirm against the caller).
/// Context slots whose token index falls outside <c>articleCurrent</c>, or
/// whose token has no part-of-speech tag, are left unset.
/// </remarks>
/// <param name="fvWhy">Attribute vector supplying the attribute for each slot.</param>
/// <param name="candidate">Candidate token whose features are encoded.</param>
/// <returns>The populated instance.</returns>
private Instance createSingleWhyInstance(FastVector fvWhy, Token candidate)
{
    // Running computation of where each attribute group starts.
    const int fixedFeatureCount = 7;
    int wordBeforeBase = fixedFeatureCount;
    int wordAfterBase = wordBeforeBase + whyWordsBefore;
    int posBeforeBase = wordAfterBase + whyWordsAfter;
    int posAfterBase = posBeforeBase + whyWordsBefore;
    int attributeTotal = posAfterBase + whyWordsAfter + 1;

    Instance instance = new DenseInstance(attributeTotal);

    // Features taken directly from the candidate.
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(0), candidate.Value);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(1), candidate.Value.Split(' ').Length);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(2), candidate.Sentence);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(3), candidate.Score);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(4), candidate.NumWho);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(5), candidate.NumWhen);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(6), candidate.NumWhere);

    // Preceding context: slot 0 is the farthest token, the last slot the nearest.
    for (int slot = 0; slot < whyWordsBefore; slot++)
    {
        int tokenIndex = candidate.Position - (whyWordsBefore - slot) - 1;
        if (tokenIndex < 0)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWhy.elementAt(wordBeforeBase + slot), neighbour.Value);

        var tag = neighbour.PartOfSpeech;
        if (tag != null)
        {
            instance.setValue((weka.core.Attribute)fvWhy.elementAt(posBeforeBase + slot), tag);
        }
    }

    // Following context: slot 0 reads articleCurrent[candidate.Position].
    for (int slot = 0; slot < whyWordsAfter; slot++)
    {
        int tokenIndex = candidate.Position + slot;
        if (tokenIndex >= articleCurrent.Count)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWhy.elementAt(wordAfterBase + slot), neighbour.Value);

        var tag = neighbour.PartOfSpeech;
        if (tag != null)
        {
            instance.setValue((weka.core.Attribute)fvWhy.elementAt(posAfterBase + slot), tag);
        }
    }

    return instance;
}
/// <summary>
/// Tokenizes the given text and tags every token with a running position
/// and a sentence number.
/// </summary>
/// <remarks>
/// Sentences come from <c>MaxentTagger.tokenizeText</c>. Positions and
/// sentence numbers both start at 1. The sentence number is NOT advanced
/// when the last token of a tokenizer sentence whose value is not "."
/// matches one of the listed abbreviations, so the following tokenizer
/// sentence receives the same sentence number.
/// </remarks>
/// <param name="toBeTokenized">Raw text to tokenize.</param>
/// <returns>All tokens in reading order.</returns>
public List<Token> performTokenizationAndSS(String toBeTokenized)
{
    String[] abbreviations = new String[]
    {
        // Names
        "Dr", "Dra", "Gng", "G", "Gg", "Bb", "Esq", "Jr", "Mr", "Mrs", "Ms",
        "Messrs", "Mmes", "Msgr", "Prof", "Rev", "Pres", "Sec", "Sr", "Fr",
        "St", "Hon",
        // Streets
        "Ave", "Aly",
        // Military rank
        "Gen", "1Lt", "2Lt", "Cpt", "Maj", "Capt", "1stLt", "2ndLt", "Adm",
        "W01", "CW2", "CW3", "CW4", "CW5", "Col", "LtCol", "BG", "MG", "Sgt",
        "SSgt", "LCpl", "SgtMaj", "1stSgt", "1Sgt", "Pvt"
    };

    var rawSentences = MaxentTagger.tokenizeText(new java.io.StringReader(toBeTokenized)).toArray();
    List<Token> tokens = new List<Token>();
    int sentenceNumber = 1;
    int position = 1;

    foreach (java.util.ArrayList rawSentence in rawSentences)
    {
        // Text of the last token in this sentence whose value is not ".".
        String lastWord = "";

        foreach (var rawWord in rawSentence)
        {
            String text = rawWord.ToString();
            var token = new Token(text, position) { Sentence = sentenceNumber };
            tokens.Add(token);
            position++;

            if (!token.Value.Equals("."))
            {
                lastWord = text;
            }
        }

        // A trailing abbreviation means the period did not really end the sentence.
        bool endsWithAbbreviation = false;
        foreach (String abbreviation in abbreviations)
        {
            if (lastWord.Equals(abbreviation))
            {
                endsWithAbbreviation = true;
                break;
            }
        }

        if (!endsWithAbbreviation)
        {
            sentenceNumber++;
        }
    }

    return tokens;
}
/// <summary>
/// Builds the Weka instance that describes a single "who" candidate.
/// </summary>
/// <remarks>
/// Attribute layout, as indexed into <paramref name="fvWho"/>:
/// 0-5 candidate features, then <c>whoWordsBefore</c> preceding words,
/// <c>whoWordsAfter</c> following words, <c>whoWordsBefore</c> preceding
/// part-of-speech tags and <c>whoWordsAfter</c> following part-of-speech tags.
/// The instance is allocated with one extra slot that this method never
/// fills (presumably the class attribute; confirm against the caller).
/// Attribute 4 (relative offset from the start of the candidate's sentence)
/// is only set when a matching sentence is found in
/// <c>segregatedArticleCurrent</c> and the computed value is greater than -1.
/// Context slots whose token index falls outside <c>articleCurrent</c>, or
/// whose token has no part-of-speech tag, are left unset.
/// </remarks>
/// <param name="fvWho">Attribute vector supplying the attribute for each slot.</param>
/// <param name="candidate">Candidate token whose features are encoded.</param>
/// <returns>The populated instance.</returns>
private Instance createSingleWhoInstance(FastVector fvWho, Token candidate)
{
    // Running computation of where each attribute group starts.
    const int fixedFeatureCount = 6;
    int wordBeforeBase = fixedFeatureCount;
    int wordAfterBase = wordBeforeBase + whoWordsBefore;
    int posBeforeBase = wordAfterBase + whoWordsAfter;
    int posAfterBase = posBeforeBase + whoWordsBefore;
    int attributeTotal = posAfterBase + whoWordsAfter + 1;

    Instance instance = new DenseInstance(attributeTotal);

    // Features taken directly from the candidate.
    instance.setValue((weka.core.Attribute)fvWho.elementAt(0), candidate.Value);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(1), candidate.Value.Split(' ').Length);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(2), candidate.Sentence);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(3), candidate.Position);

    // Offset from the first token of the candidate's sentence, divided by the
    // sentence length. -1 doubles as the "not found" marker.
    double startProximity = -1;
    foreach (List<Token> sentenceTokens in segregatedArticleCurrent)
    {
        if (sentenceTokens.Count <= 0)
        {
            continue;
        }

        var firstToken = sentenceTokens[0];
        if (firstToken.Sentence == candidate.Sentence)
        {
            startProximity = (double)(candidate.Position - firstToken.Position) / (double)sentenceTokens.Count;
            break;
        }
    }

    if (startProximity > -1)
    {
        instance.setValue((weka.core.Attribute)fvWho.elementAt(4), startProximity);
    }

    instance.setValue((weka.core.Attribute)fvWho.elementAt(5), candidate.Frequency);

    // Preceding context: slot 0 is the farthest token, the last slot the nearest.
    for (int slot = 0; slot < whoWordsBefore; slot++)
    {
        int tokenIndex = candidate.Position - (whoWordsBefore - slot) - 1;
        if (tokenIndex < 0)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWho.elementAt(wordBeforeBase + slot), neighbour.Value);

        var tag = neighbour.PartOfSpeech;
        if (tag != null)
        {
            instance.setValue((weka.core.Attribute)fvWho.elementAt(posBeforeBase + slot), tag);
        }
    }

    // Following context: slot 0 reads articleCurrent[candidate.Position].
    for (int slot = 0; slot < whoWordsAfter; slot++)
    {
        int tokenIndex = candidate.Position + slot;
        if (tokenIndex >= articleCurrent.Count)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWho.elementAt(wordAfterBase + slot), neighbour.Value);

        var tag = neighbour.PartOfSpeech;
        if (tag != null)
        {
            instance.setValue((weka.core.Attribute)fvWho.elementAt(posAfterBase + slot), tag);
        }
    }

    return instance;
}