Esempio n. 1
0
        private Instance createSingleWhyInstance(FastVector fvWhy, Token candidate)
        {
            //first word-n attribute number
            int wordsBeforeFirstAttributeNumber = 7;
            //first pos-n attribute number
            int posBeforeFirstAttributeNumber = wordsBeforeFirstAttributeNumber + whyWordsBefore + whyWordsAfter;
            //word+1 attribute number
            int wordsAfterFirstAttributeNumber = wordsBeforeFirstAttributeNumber + whyWordsBefore;
            //pos+1 attribute number
            int posAfterFirstAttributeNumber = posBeforeFirstAttributeNumber + whyWordsBefore;

            int totalAttributeCount = wordsBeforeFirstAttributeNumber + whyWordsBefore * 2 + whyWordsAfter * 2 + 1;

            Instance whyCandidate = new DenseInstance(totalAttributeCount);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(0), candidate.Value);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(1), candidate.Value.Split(' ').Count());
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(2), candidate.Sentence);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(3), candidate.Score);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(4), candidate.NumWho);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(5), candidate.NumWhen);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(6), candidate.NumWhere);
            for (int i = whyWordsBefore; i > 0; i--)
            {
                if (candidate.Position - i - 1 >= 0)
                {
                    whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(whyWordsBefore - i + wordsBeforeFirstAttributeNumber), articleCurrent[candidate.Position - i - 1].Value);
                    if (articleCurrent[candidate.Position - i - 1].PartOfSpeech != null)
                    {
                        whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(whyWordsBefore - i + posBeforeFirstAttributeNumber), articleCurrent[candidate.Position - i - 1].PartOfSpeech);
                    }
                }
            }
            for (int i = 0; i < whyWordsAfter; i++)
            {
                if (candidate.Position + i < articleCurrent.Count)
                {
                    whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(wordsAfterFirstAttributeNumber + i), articleCurrent[candidate.Position + i].Value);
                    if (articleCurrent[candidate.Position + i].PartOfSpeech != null)
                    {
                        whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(posAfterFirstAttributeNumber + i), articleCurrent[candidate.Position + i].PartOfSpeech);
                    }
                }
            }
            return whyCandidate;
        }
Esempio n. 2
0
 public List<Token> performTokenizationAndSS(String toBeTokenized)
 {
     List<Token> tokenizedString = new List<Token>();
     var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(toBeTokenized)).toArray();
     int sentenceCounter = 1;
     int positionCounter = 1;
     String[] abbreviationList = new String[] {
     "Dr", //Names
     "Dra",
     "Gng",
     "G",
     "Gg",
     "Bb",
     "Esq",
     "Jr",
     "Mr",
     "Mrs",
     "Ms",
     "Messrs",
     "Mmes",
     "Msgr",
     "Prof",
     "Rev",
     "Pres",
     "Sec",
     "Sr",
     "Fr",
     "St",
     "Hon",
     "Ave", //Streets
     "Aly",
     "Gen", //Military Rank
     "1Lt",
     "2Lt",
     "Cpt",
     "Maj",
     "Capt",
     "1stLt",
     "2ndLt",
     "Adm",
     "W01",
     "CW2",
     "CW3",
     "CW4",
     "CW5",
     "Col",
     "LtCol",
     "BG",
     "MG",
     "Sgt",
     "SSgt",
     "LCpl",
     "SgtMaj",
     "1stSgt",
     "1Sgt",
     "Pvt"
     };
     foreach (java.util.ArrayList sentence in sentences)
     {
         String wordFinal = "";
         foreach (var word in sentence)
         {
             var newToken = new Token(word.ToString(), positionCounter);
             newToken.Sentence = sentenceCounter;
             tokenizedString.Add(newToken);
             positionCounter++;
             if(!newToken.Value.Equals("."))
                 wordFinal = word.ToString();
         }
         Boolean flag = true;
         foreach(String word in abbreviationList)
         {
             if (wordFinal.Equals(word))
                 flag = false;
         }
         if (flag)
             sentenceCounter++;
     }
     return tokenizedString;
 }
Esempio n. 3
0
        private Instance createSingleWhoInstance(FastVector fvWho, Token candidate)
        {
            //first word-n attribute number
            int wordsBeforeFirstAttributeNumber = 6;
            //first pos-n attribute number
            int posBeforeFirstAttributeNumber = wordsBeforeFirstAttributeNumber + whoWordsBefore + whoWordsAfter;
            //word+1 attribute number
            int wordsAfterFirstAttributeNumber = wordsBeforeFirstAttributeNumber + whoWordsBefore;
            //pos+1 attribute number
            int posAfterFirstAttributeNumber = posBeforeFirstAttributeNumber + whoWordsBefore;

            int totalAttributeCount = wordsBeforeFirstAttributeNumber + whoWordsBefore * 2 + whoWordsAfter * 2 + 1;

            Instance whoCandidate = new DenseInstance(totalAttributeCount);
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(0), candidate.Value);
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(1), candidate.Value.Split(' ').Count());
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(2), candidate.Sentence);
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(3), candidate.Position);
            double sentenceStartProximity = -1;
            foreach (List<Token> tokenList in segregatedArticleCurrent)
            {
                if (tokenList.Count > 0 && tokenList[0].Sentence == candidate.Sentence)
                {
                    sentenceStartProximity = (double)(candidate.Position - tokenList[0].Position) / (double)tokenList.Count;
                    break;
                }
            }
            if (sentenceStartProximity > -1)
            {
                whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(4), sentenceStartProximity);
            }
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(5), candidate.Frequency);

            for (int i = whoWordsBefore; i > 0; i--)
            {
                if (candidate.Position - i - 1 >= 0)
                {
                    whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(whoWordsBefore - i + wordsBeforeFirstAttributeNumber), articleCurrent[candidate.Position - i - 1].Value);
                    if (articleCurrent[candidate.Position - i - 1].PartOfSpeech != null)
                    {
                        whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(whoWordsBefore - i + posBeforeFirstAttributeNumber), articleCurrent[candidate.Position - i - 1].PartOfSpeech);
                    }
                }
            }
            for (int i = 0; i < whoWordsAfter; i++)
            {
                if (candidate.Position + i < articleCurrent.Count)
                {
                    whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(wordsAfterFirstAttributeNumber + i), articleCurrent[candidate.Position + i].Value);
                    if (articleCurrent[candidate.Position + i].PartOfSpeech != null)
                    {
                        whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(posAfterFirstAttributeNumber + i), articleCurrent[candidate.Position + i].PartOfSpeech);
                    }
                }
            }
            return whoCandidate;
        }