예제 #1
0
        /// <summary>
        /// Collects the run of consecutive tokens starting at <paramref name="i"/> that share one
        /// named-entity tag and appends it to <paramref name="candidates"/> as a single candidate.
        /// </summary>
        /// <param name="nerTag">Named-entity tag the token at <paramref name="i"/> must carry for a candidate to be produced.</param>
        /// <param name="i">Index of the token to start from.</param>
        /// <param name="candidates">List that receives the new candidate.</param>
        /// <param name="tokenizedArticle">Tokens of the article being processed.</param>
        /// <returns>
        /// Index of the last token consumed by the run, or <paramref name="i"/> unchanged when no candidate was produced.
        /// </returns>
        private int getCandidateByNer(String nerTag, int i, List<Candidate> candidates, List<Token> tokenizedArticle)
        {
            // Same guards as getCandidateByPos: an index past the end or a token without a
            // named-entity tag simply produces no candidate instead of throwing.
            if (i >= tokenizedArticle.Count
                || tokenizedArticle[i].NamedEntity == null
                || !tokenizedArticle[i].NamedEntity.Equals(nerTag))
            {
                return i;
            }

            int startIndex = i;
            String strValue = tokenizedArticle[i].Value;
            int tempWs = tokenizedArticle[i].Frequency; // highest frequency seen in the run so far

            // Extend the run while the next token carries the same named-entity tag as the current one.
            while ((i + 1) < tokenizedArticle.Count && tokenizedArticle[i].NamedEntity == tokenizedArticle[i + 1].NamedEntity)
            {
                i++;
                if (tokenizedArticle[i].Value.Equals(",") || tokenizedArticle[i].Value.Equals("."))
                {
                    // Commas and periods attach directly to the preceding word.
                    strValue += tokenizedArticle[i].Value;
                }
                else
                {
                    strValue += " " + tokenizedArticle[i].Value;
                }
                if (tokenizedArticle[i].Frequency > tempWs)
                {
                    tempWs = tokenizedArticle[i].Frequency;
                }
            }

            int endIndex = i;

            // Third constructor argument is the Position difference between the last and the first token.
            var newToken = new Candidate(strValue, tokenizedArticle[startIndex].Position, tokenizedArticle[endIndex].Position - tokenizedArticle[startIndex].Position);
            // Sentence, named entity and part of speech are copied from the LAST token of the run.
            newToken.Sentence = tokenizedArticle[endIndex].Sentence;
            newToken.NamedEntity = tokenizedArticle[endIndex].NamedEntity;
            newToken.PartOfSpeech = tokenizedArticle[endIndex].PartOfSpeech;
            newToken.Frequency = tempWs;
            candidates.Add(newToken);

            //System.Console.WriteLine("CANDIDATE BY NER [{0}]: {1} (Position {2})", nerTag, newToken.Value, newToken.Position);
            return i;
        }
예제 #2
0
        /// <summary>
        /// Merges the run of consecutive tokens starting at <paramref name="i"/> that share one
        /// part-of-speech tag into a single candidate and appends it to <paramref name="candidates"/>.
        /// </summary>
        /// <param name="posTag">Part-of-speech tag the token at <paramref name="i"/> must carry for a candidate to be produced.</param>
        /// <param name="i">Index of the token to start from.</param>
        /// <param name="candidates">List that receives the new candidate.</param>
        /// <param name="tokenizedArticle">Tokens of the article being processed.</param>
        /// <returns>
        /// Index of the last token consumed by the run, or <paramref name="i"/> unchanged when no candidate was produced.
        /// </returns>
        private int getCandidateByPos(String posTag, int i, List<Candidate> candidates, List<Token> tokenizedArticle)
        {
            bool startsRun = i < tokenizedArticle.Count
                && tokenizedArticle[i].PartOfSpeech != null
                && tokenizedArticle[i].PartOfSpeech.Equals(posTag);
            if (!startsRun)
            {
                return i;
            }

            int firstIndex = i;
            int lastIndex = firstIndex;
            String phrase = tokenizedArticle[firstIndex].Value;
            int highestFrequency = tokenizedArticle[firstIndex].Frequency;

            // Walk forward while each token has the same tag as the one right before it.
            for (int next = firstIndex + 1;
                 next < tokenizedArticle.Count && tokenizedArticle[next - 1].PartOfSpeech == tokenizedArticle[next].PartOfSpeech;
                 next++)
            {
                phrase = phrase + " " + tokenizedArticle[next].Value;
                highestFrequency = Math.Max(highestFrequency, tokenizedArticle[next].Frequency);
                lastIndex = next;
            }

            // Length argument is the Position difference between the last and the first token;
            // the remaining metadata is taken from the last token of the run.
            var merged = new Candidate(
                phrase,
                tokenizedArticle[firstIndex].Position,
                tokenizedArticle[lastIndex].Position - tokenizedArticle[firstIndex].Position)
            {
                Sentence = tokenizedArticle[lastIndex].Sentence,
                NamedEntity = tokenizedArticle[lastIndex].NamedEntity,
                PartOfSpeech = tokenizedArticle[lastIndex].PartOfSpeech,
                Frequency = highestFrequency
            };
            candidates.Add(merged);

            return lastIndex;
        }
예제 #3
0
        /// <summary>
        /// Looks for a phrase that begins at a start marker located at index <paramref name="i"/> and
        /// runs up to a matching end marker; when one is found it is appended to <paramref name="candidates"/>.
        /// </summary>
        /// <param name="generalStopWords">Words that stop the scan for every start marker (may be null).</param>
        /// <param name="startMarkers">Words that open a possible candidate phrase (compared case-insensitively).</param>
        /// <param name="endMarkers">Per start marker: words that close the phrase (compared case-sensitively).</param>
        /// <param name="enderMarkers">Per start marker: words that stop the scan (may be null).</param>
        /// <param name="enderPOS">Per start marker: part-of-speech tags that stop the scan (may be null).</param>
        /// <param name="enderPOSType">Per start marker: part-of-speech prefixes, e.g. VB for all verbs, that stop the scan (may be null).</param>
        /// <param name="i">Index of the token tested against the start markers.</param>
        /// <param name="candidates">List that receives the new candidate.</param>
        /// <param name="tokenizedArticle">Tokens of the article being processed.</param>
        /// <param name="isExclusive">
        /// When true the start marker is left out of the phrase text and the end index is pulled back by one;
        /// when false the phrase text starts with the start marker.
        /// </param>
        private void getCandidateByMarkers(String[] generalStopWords, String[] startMarkers, String[][] endMarkers, String[][] enderMarkers, String[][] enderPOS, String[][] enderPOSType, int i, List<Candidate> candidates, List<Token> tokenizedArticle, Boolean isExclusive)
        {
            // Nothing to inspect past the end of the article.
            if (i >= tokenizedArticle.Count)
            {
                return;
            }

            for (int j = 0; j < startMarkers.Length; j++)
            {
                if (!tokenizedArticle[i].Value.Equals(startMarkers[j], StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int sentenceNumber = tokenizedArticle[i].Sentence;
                String strValue = null;
                if (!isExclusive)
                {
                    strValue = tokenizedArticle[i].Value;
                }
                int tempWs = 0;
                Boolean flag = true;
                Boolean endMarkerFound = false;
                i++;
                int startIndex = i;

                // The start marker was the last token: there is no phrase to scan.
                if (i >= tokenizedArticle.Count)
                {
                    return;
                }

                while (flag)
                {
                    foreach (String markers in endMarkers[j])
                    {
                        if (tokenizedArticle[i].Value.Equals(markers))
                        {
                            endMarkerFound = true;
                            flag = false;
                            break;
                        }
                    }
                    if (generalStopWords != null)
                    {
                        foreach (String stopWords in generalStopWords)
                        {
                            if (tokenizedArticle[i].Value.Equals(stopWords, StringComparison.OrdinalIgnoreCase))
                            {
                                flag = false;
                                break;
                            }
                        }
                    }
                    if (enderPOS != null && tokenizedArticle[i].PartOfSpeech != null)
                    {
                        // Tokens without a tag cannot match a stop tag; skipping them avoids a null dereference.
                        foreach (String POS in enderPOS[j])
                        {
                            if (tokenizedArticle[i].PartOfSpeech.Equals(POS, StringComparison.OrdinalIgnoreCase))
                            {
                                flag = false;
                                break;
                            }
                        }
                    }
                    if (enderPOSType != null)
                    {
                        if (tokenizedArticle[i].PartOfSpeech == null)
                        {
                            // Leaves the while loop immediately, so i is NOT advanced past this token.
                            flag = false;
                            break;
                        }
                        foreach (String Type in enderPOSType[j])
                        {
                            if (tokenizedArticle[i].PartOfSpeech.StartsWith(Type))
                            {
                                flag = false;
                                break;
                            }
                        }
                    }
                    if (enderMarkers != null)
                    {
                        foreach (String markers in enderMarkers[j])
                        {
                            if (tokenizedArticle[i].Value.Equals(markers, StringComparison.OrdinalIgnoreCase))
                            {
                                flag = false;
                                break;
                            }
                        }
                    }
                    if (tokenizedArticle[i].Sentence != sentenceNumber)
                    {
                        flag = false;
                    }
                    i++;
                    if (i >= tokenizedArticle.Count)
                    {
                        flag = false;
                    }
                }

                int endIndex = isExclusive ? i - 1 : i;

                if (endMarkerFound)
                {
                    for (int k = startIndex; k < endIndex; k++)
                    {
                        if (strValue == null)
                        {
                            strValue = tokenizedArticle[k].Value;
                        }
                        else
                        {
                            // Commas and periods attach directly to the preceding word.
                            strValue += (tokenizedArticle[k].Value.Equals(",") || tokenizedArticle[k].Value.Equals(".") ? "" : " ") + tokenizedArticle[k].Value;
                        }

                        if (tokenizedArticle[k].Frequency > tempWs)
                        {
                            tempWs = tokenizedArticle[k].Frequency;
                        }
                    }
                    if (strValue != null)
                    {
                        // endIndex equals Count when a non-exclusive end marker is the last token of the
                        // article; clamp it so the metadata lookups below stay inside the list.
                        int lastTokenIndex = Math.Min(endIndex, tokenizedArticle.Count - 1);

                        var newToken = new Candidate(strValue, tokenizedArticle[startIndex].Position, tokenizedArticle[lastTokenIndex].Position - tokenizedArticle[startIndex].Position);
                        newToken.Sentence = tokenizedArticle[startIndex].Sentence;
                        newToken.NamedEntity = tokenizedArticle[lastTokenIndex].NamedEntity;
                        newToken.PartOfSpeech = tokenizedArticle[lastTokenIndex].PartOfSpeech;
                        newToken.Frequency = tempWs;
                        candidates.Add(newToken);
                    }

                    //System.Console.WriteLine("CANDIDATE BY MARKERS: {0}"/*\n\t{1}*/, newToken.Value/*, posValue*/);
                }

                // Only the first matching start marker is processed.
                return;
            }
        }
예제 #4
0
        /// <summary>
        /// Adds the token at index <paramref name="i"/> as a one-token candidate when its value is
        /// listed in <paramref name="gazette"/> and its sentence number is at most 3.
        /// </summary>
        /// <param name="gazette">Values that qualify a token as a candidate.</param>
        /// <param name="i">Index of the token to test.</param>
        /// <param name="candidates">List that receives the new candidate.</param>
        /// <param name="tokenizedArticle">Tokens of the article being processed.</param>
        private void getCandidateByGazette(String[] gazette, int i, List<Candidate> candidates, List<Token> tokenizedArticle)
        {
            bool inEarlySentence = i < tokenizedArticle.Count && tokenizedArticle[i].Sentence <= 3;
            if (!inEarlySentence)
            {
                return;
            }

            if (!gazette.Contains(tokenizedArticle[i].Value))
            {
                return;
            }

            // The candidate mirrors the token's metadata; its length argument is fixed at 1.
            var gazetteHit = new Candidate(tokenizedArticle[i].Value, tokenizedArticle[i].Position, 1)
            {
                Sentence = tokenizedArticle[i].Sentence,
                NamedEntity = tokenizedArticle[i].NamedEntity,
                PartOfSpeech = tokenizedArticle[i].PartOfSpeech,
                Frequency = tokenizedArticle[i].Frequency
            };
            candidates.Add(gazetteHit);
        }
예제 #5
0
        /// <summary>
        /// Flags the candidates whose value equals one of the ';'-separated annotations of the given
        /// type, adds a flagged candidate for annotations that no existing candidate matches (when the
        /// annotation's first word can be located in the tokenized article), and reports match statistics.
        /// </summary>
        /// <param name="annotationType">"WHO", "WHEN" or "WHERE" (case-insensitive); any other value yields all-zero statistics.</param>
        /// <returns>
        /// Three floats: [0] matched annotations divided by the number of annotations (recall),
        /// [1] matched annotations divided by the number of candidates after any additions (precision,
        /// 0 when there are no candidates), [2] 1 when an annotation was evaluated. All zeros when the
        /// annotation is null, empty or "N/A".
        /// </returns>
        private float[] performMultipleAnnotationAssignment(String annotationType)
        {
            float[] statistics = new float[3] { 0, 0, 0 }; // [0] = recall, [1] = precision, [2] = total
            annotationType = annotationType.ToUpper();

            // Select the annotation text, the candidate list and the flag setter for the requested type.
            String strAnnotation;
            List<Candidate> candidateList;
            Action<Candidate> markAsAnswer;
            switch (annotationType)
            {
                case "WHO":
                    strAnnotation = annotationCurrent.Who;
                    candidateList = listWhoCandidates;
                    markAsAnswer = candidate => { candidate.IsWho = true; };
                    break;
                case "WHEN":
                    strAnnotation = annotationCurrent.When;
                    candidateList = listWhenCandidates;
                    markAsAnswer = candidate => { candidate.IsWhen = true; };
                    break;
                case "WHERE":
                    strAnnotation = annotationCurrent.Where;
                    candidateList = listWhereCandidates;
                    markAsAnswer = candidate => { candidate.IsWhere = true; };
                    break;
                default:
                    return statistics;
            }

            // A missing annotation is treated like an empty one instead of throwing.
            if (String.IsNullOrEmpty(strAnnotation) || strAnnotation == "N/A")
            {
                return statistics;
            }

            string[] arrAnnotations = strAnnotation.Split(';');
            int totalMatch = 0;

            for (int r = 0; r < arrAnnotations.Length; r++)
            {
                // Drop the single space that follows a ';' separator.
                if (arrAnnotations[r].Length > 0 && arrAnnotations[r][0] == ' ')
                {
                    arrAnnotations[r] = arrAnnotations[r].Substring(1);
                }

                // Only the first candidate with exactly the same value is flagged.
                Candidate matched = null;
                foreach (var candidate in candidateList)
                {
                    if (candidate.Value == arrAnnotations[r])
                    {
                        matched = candidate;
                        break;
                    }
                }

                if (matched != null)
                {
                    markAsAnswer(matched);
                    totalMatch += 1;
                    continue;
                }

                // No extracted candidate matches: add the annotation itself as a flagged candidate.
                Candidate added = createCandidateFromAnnotation(arrAnnotations[r]);
                if (added != null)
                {
                    markAsAnswer(added);
                    candidateList.Add(added);
                }
            }

            statistics[2] += 1;
            statistics[0] = (float)totalMatch / arrAnnotations.Length;
            // Guard against 0/0, which would otherwise produce NaN.
            statistics[1] = candidateList.Count > 0 ? (float)totalMatch / candidateList.Count : 0f;
            return statistics;
        }

        /// <summary>
        /// Builds a candidate for an annotation that no extracted candidate matched, anchored at the
        /// first article token whose value is contained in the annotation's first word.
        /// </summary>
        /// <param name="annotation">Annotation text, words separated by single spaces.</param>
        /// <returns>The new candidate, or null when no anchoring token exists in the article.</returns>
        private Candidate createCandidateFromAnnotation(String annotation)
        {
            String[] wordForWordAnnotation = annotation.Split(' ');

            int startIndex = -1;
            for (int ctr = 0; ctr < listLatestTokenizedArticle.Count; ctr++)
            {
                if (wordForWordAnnotation[0].Contains(listLatestTokenizedArticle[ctr].Value))
                {
                    startIndex = ctr;
                    break;
                }
            }

            if (startIndex < 0)
            {
                return null;
            }

            // Highest frequency among the tokens the annotation would span, bounded by the article length.
            int tempWs = listLatestTokenizedArticle[startIndex].Frequency;
            int spanEnd = Math.Min(startIndex + wordForWordAnnotation.Length, listLatestTokenizedArticle.Count);
            for (int ctr = startIndex; ctr < spanEnd; ctr++)
            {
                if (listLatestTokenizedArticle[ctr].Frequency > tempWs)
                {
                    tempWs = listLatestTokenizedArticle[ctr].Frequency;
                }
            }

            // NOTE(review): the third argument is the index of the last spanned token, whereas the other
            // extraction methods pass a position difference or a length - confirm which one Candidate expects.
            var newToken = new Candidate(annotation, listLatestTokenizedArticle[startIndex].Position, startIndex + wordForWordAnnotation.Length - 1);
            newToken.Sentence = listLatestTokenizedArticle[startIndex].Sentence;
            newToken.NamedEntity = listLatestTokenizedArticle[startIndex].NamedEntity;
            newToken.PartOfSpeech = listLatestTokenizedArticle[startIndex].PartOfSpeech;
            newToken.Frequency = tempWs;
            return newToken;
        }
예제 #6
0
        /// <summary>
        /// Scores every WHY candidate sentence, trims it at the first causal marker it contains, and,
        /// when the article is annotated, trains the "why" model on the resulting candidates.
        /// </summary>
        /// <remarks>
        /// A candidate earns 0.5 when it contains the WHAT text and 0.5 when it contains a marker.
        /// Its score is reset to 0 when the trimmed text is contained in the WHAT text; a candidate
        /// whose trimmed text equals the WHAT text passes 0.5 on to the next candidate. If no candidate
        /// matches the annotation, the annotation itself is added as a candidate when its first tokens
        /// can be located in the article. The secondary candidate list is emptied at the end.
        /// </remarks>
        private void labelWhy()
        {
            double WEIGHT_PER_MARKER = 0.5;
            double WEIGHT_PER_WHAT = 0.5;
            double CARRY_OVER = 0;

            // { marker text, side }: "START" keeps the text after the marker, "END" the text before it.
            // Order matters: the first marker found in a candidate wins.
            String[][] markers = new String[][] {
                new String[] { " sanhi sa ", "START" },
                new String[] { " sanhi ng ", "START" },
                new String[] { " sapagkat ", "START" },
                new String[] { " palibhasa ay ", "START" },
                new String[] { " palibhasa ", "START" },
                new String[] { " kasi ", "START" },
                new String[] { " mangyari'y ", "START" },
                new String[] { " mangyari ay ", "START" },
                new String[] { " dahil sa ", "START" },
                new String[] { " dahil na rin sa ", "START" },
                new String[] { " dahil ", "START" },
                new String[] { " dahilan sa", "START" },
                new String[] { " dahilan ", "START" },
                new String[] { " para ", "START" },
                new String[] { " upang ", "START" },
                new String[] { " makaraang ", "START" },
                new String[] { " naglalayong ", "START" },
                new String[] { " kaya ", "END" }
            };

            if (listWhyCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Built once instead of once per candidate; a missing WHY annotation counts as empty.
                Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                String whyAnnotation = "";
                String normalizedAnnotation = "";
                if (isAnnotated)
                {
                    whyAnnotation = annotationCurrent.Why ?? "";
                    normalizedAnnotation = nonAlphanumeric.Replace(whyAnnotation, "");
                }

                foreach (List<Token> candidate in listWhyCandidates)
                {
                    double tempWeight = 0;

                    // Rebuild the sentence text and undo the tokenizer's spacing around punctuation.
                    String tempWhy = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    tempWhy = tempWhy.Replace("-LRB- ", "(");
                    tempWhy = tempWhy.Replace(" -RRB-", ")");
                    tempWhy = tempWhy.Replace(" . ", ".");
                    tempWhy = tempWhy.Replace(" .", ".");
                    tempWhy = tempWhy.Replace(" ,", ",");
                    tempWhy = tempWhy.Replace(" !", "!");

                    String copyWhy = tempWhy;

                    if (tempWhy.Contains(strWhat))
                    {
                        tempWeight += WEIGHT_PER_WHAT;
                    }

                    String[] match = markers.FirstOrDefault(s => tempWhy.Contains(s[0]));

                    if (match != null)
                    {
                        // Ordinal search agrees with the ordinal Contains above, so the index is never -1.
                        int markerIndex = tempWhy.IndexOf(match[0], StringComparison.Ordinal);
                        tempWhy = (match[1].Equals("START")) ?
                            tempWhy.Substring(markerIndex + match[0].Length) :
                            tempWhy.Substring(0, markerIndex);
                        tempWeight += WEIGHT_PER_MARKER;
                    }

                    tempWeight += CARRY_OVER;
                    CARRY_OVER = 0;

                    if (strWhat.Contains(tempWhy))
                    {
                        tempWeight = 0;
                    }

                    if (strWhat.Equals(tempWhy))
                    {
                        CARRY_OVER = 0.5;
                    }

                    // Offset of the trimmed text inside the sentence, counted in space-separated words.
                    int wordsBefore = copyWhy.Substring(0, copyWhy.IndexOf(tempWhy, StringComparison.Ordinal)).Split(' ').Length - 1;
                    int position = candidate[0].Position + wordsBefore;
                    int length = tempWhy.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhy, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhy.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare with everything except ASCII letters and digits removed.
                        var candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == normalizedAnnotation)
                        {
                            newCandidate.IsWhy = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhyCandidates.Add(newCandidate);
                }

                if (isAnnotated && !foundMatching && whyAnnotation.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(whyAnnotation);

                    // An annotation that tokenizes to nothing cannot be located in the article.
                    if (tokenizedAnnotation != null && tokenizedAnnotation.Count > 0)
                    {
                        Candidate newCandidate = new Candidate(
                            whyAnnotation, 0, whyAnnotation.Split(' ').Length
                        );

                        int sentenceNumber = -1;
                        int position = -1;

                        // Locate the annotation by its first (up to three) tokens. The bound follows the
                        // number of tokens actually compared, so short annotations near the end of the
                        // article are still found.
                        int prefixLength = Math.Min(tokenizedAnnotation.Count, 3);
                        for (int i = 0; i + prefixLength <= articleCurrent.Count; i++)
                        {
                            if (tokenizedAnnotation[0].Value == articleCurrent[i].Value &&
                               (tokenizedAnnotation.Count < 2 || tokenizedAnnotation[1].Value == articleCurrent[i + 1].Value) &&
                               (tokenizedAnnotation.Count < 3 || tokenizedAnnotation[2].Value == articleCurrent[i + 2].Value))
                            {
                                sentenceNumber = articleCurrent[i].Sentence;
                                position = articleCurrent[i].Position;
                                break;
                            }
                        }

                        if (sentenceNumber != -1 && position != -1)
                        {
                            double tempWeight = 0;
                            // A missing WHAT annotation contributes nothing instead of throwing.
                            String whatAnnotation = annotationCurrent.What;

                            if (whatAnnotation != null && whyAnnotation.Contains(whatAnnotation))
                            {
                                tempWeight += WEIGHT_PER_WHAT;
                            }

                            if (markers.Any(s => whyAnnotation.Contains(s[0])))
                            {
                                tempWeight += WEIGHT_PER_MARKER;
                            }

                            tempWeight += CARRY_OVER;
                            CARRY_OVER = 0;

                            if (whatAnnotation != null && whatAnnotation.Contains(whyAnnotation))
                            {
                                tempWeight = 0;
                            }

                            newCandidate.Position = position;
                            newCandidate.Sentence = sentenceNumber;
                            newCandidate.Score = tempWeight;
                            newCandidate.NumWho = listWho.Where(whyAnnotation.Contains).Count();
                            newCandidate.NumWhen = listWhen.Where(whyAnnotation.Contains).Count();
                            newCandidate.NumWhere = listWhere.Where(whyAnnotation.Contains).Count();

                            listSecondaryWhyCandidates.Add(newCandidate);
                        }
                    }
                }

                if (isAnnotated)
                {
                    wwt.train("why", articleCurrent, listSecondaryWhyCandidates);
                }

                listSecondaryWhyCandidates = new List<Candidate>();
            }

            /*Instances whyInstances = createWhyInstances();

            foreach (Instance instance in whyInstances)
            {
                double[] classProbability = whyClassifier.distributionForInstance(instance);
                if (classProbability[0] >= classProbability[1])
                {
                    strWhy = instance.stringValue(0);
                    break;
                }
            }*/
        }
예제 #7
0
        /// <summary>
        /// Turns every entry of <c>listWhatCandidates</c> into a scored <c>Candidate</c>,
        /// collects them in <c>listSecondaryWhatCandidates</c> and, when the current article
        /// is annotated, passes them to <c>wwt.train("what", ...)</c>.
        /// </summary>
        /// <remarks>
        /// The score of a candidate is the sum of:
        /// the number of who/when/where strings contained in the candidate text (weighted),
        /// a sentence term <c>1 - WEIGHT_PER_SENTENCE * sentence</c>, and
        /// the number of who/when/where strings contained in the title (weighted).
        /// If the article is annotated and no candidate equals the annotation (ignoring
        /// non-alphanumeric characters), the annotation itself is located in the article
        /// and added as an extra candidate. The secondary candidate list is replaced by a
        /// fresh empty list before returning.
        /// </remarks>
        private void labelWhat()
        {
            const double WEIGHT_PER_WHO = 0.3;
            const double WEIGHT_PER_WHEN = 0.2;
            const double WEIGHT_PER_WHERE = 0.2;
            const double WEIGHT_PER_SENTENCE = 0.2;
            const double WEIGHT_PER_W_IN_TITLE = 0.1;

            // Each entry is { marker word, side }. "START" keeps the text after the marker,
            // "END" keeps the text before it.
            String[][] markers = new String[][] {
                new String[] { "kaya", "START" },
                new String[] { "para", "END" },
                new String[] { "dahil", "END" },
                new String[] { "upang", "END" },
                new String[] { "makaraang", "END" },
            };

            if (listWhatCandidates.Count == 0)
            {
                return;
            }

            // The title does not change per candidate, so count its matches only once.
            int titleWhoCount = listWho.Where(titleCurrent.Contains).Count();
            int titleWhenCount = listWhen.Where(titleCurrent.Contains).Count();
            int titleWhereCount = listWhere.Where(titleCurrent.Contains).Count();

            // Used to compare candidate and annotation text on alphanumeric characters only.
            Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
            String annotationKey = isAnnotated ? nonAlphanumeric.Replace(annotationCurrent.What, "") : null;

            bool foundMatching = false;
            foreach (List<Token> candidate in listWhatCandidates)
            {
                // A candidate without tokens has no text, sentence or position to work with.
                if (candidate == null || candidate.Count == 0)
                {
                    continue;
                }

                // Re-assemble the surface text and undo the tokenizer's spacing around
                // brackets and punctuation.
                String fullText = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                fullText = fullText.Replace("-LRB- ", "(");
                fullText = fullText.Replace(" -RRB-", ")");
                fullText = fullText.Replace(" . ", ".");
                fullText = fullText.Replace(" .", ".");
                fullText = fullText.Replace(" ,", ",");
                fullText = fullText.Replace(" !", "!");

                // The score is based on the full text, before any marker-based trimming.
                double weight = 0;
                weight += listWho.Where(fullText.Contains).Count() * WEIGHT_PER_WHO;
                weight += listWhen.Where(fullText.Contains).Count() * WEIGHT_PER_WHEN;
                weight += listWhere.Where(fullText.Contains).Count() * WEIGHT_PER_WHERE;
                weight += 1 - WEIGHT_PER_SENTENCE * candidate[0].Sentence;

                weight += titleWhoCount * WEIGHT_PER_W_IN_TITLE;
                weight += titleWhenCount * WEIGHT_PER_W_IN_TITLE;
                weight += titleWhereCount * WEIGHT_PER_W_IN_TITLE;

                // Trim the text at the first marker (in table order) that occurs in it.
                String whatText = fullText;
                int startOffset = 0;
                String[] marker = markers.FirstOrDefault(m => fullText.Contains(m[0]));

                if (marker != null)
                {
                    int markerIndex = fullText.IndexOf(marker[0], StringComparison.Ordinal);
                    if (marker[1].Equals("START"))
                    {
                        // Skip the marker and the space after it. Clamp so that a marker at
                        // the very end of the text yields an empty remainder instead of an
                        // ArgumentOutOfRangeException.
                        startOffset = Math.Min(markerIndex + marker[0].Length + 1, fullText.Length);
                        whatText = fullText.Substring(startOffset);
                    }
                    else
                    {
                        // NOTE: the space before the marker is kept, as in the original output.
                        whatText = fullText.Substring(0, markerIndex);
                    }
                }

                // Number of space-separated words skipped in front of the kept text. Using
                // the known offset avoids re-searching the text, which would pick the wrong
                // spot when the kept text also occurs before the marker.
                int position = candidate[0].Position + fullText.Substring(0, startOffset).Split(' ').Count() - 1;
                int length = whatText.Split(' ').Count();

                Candidate newCandidate = new Candidate(whatText, position, length);

                newCandidate.Sentence = candidate[0].Sentence;
                newCandidate.Score = weight;
                // Unlike the score, these counts are taken on the trimmed text.
                newCandidate.NumWho = listWho.Where(whatText.Contains).Count();
                newCandidate.NumWhen = listWhen.Where(whatText.Contains).Count();
                newCandidate.NumWhere = listWhere.Where(whatText.Contains).Count();

                if (isAnnotated)
                {
                    String candidateKey = nonAlphanumeric.Replace(newCandidate.Value, "");
                    if (candidateKey == annotationKey)
                    {
                        newCandidate.IsWhat = true;
                        foundMatching = true;
                    }
                }

                listSecondaryWhatCandidates.Add(newCandidate);
            }

            // No candidate matched the annotation: look the annotation up in the article
            // and add it as a candidate of its own.
            if (isAnnotated && !foundMatching && annotationCurrent.What.Length > 0)
            {
                Preprocessor p = new Preprocessor();
                List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.What);

                Candidate newCandidate = new Candidate(
                    annotationCurrent.What, 0, annotationCurrent.What.Split(' ').Count()
                );

                int sentenceNumber = -1;
                int position = -1;

                // The annotation is located by its first (up to) three tokens. The loop bound
                // follows the number of tokens actually compared, so short annotations can
                // still match at the very end of the article.
                int probeLength = (tokenizedAnnotation == null) ? 0 : Math.Min(tokenizedAnnotation.Count, 3);

                for (int i = 0; probeLength > 0 && i + probeLength <= articleCurrent.Count; i++)
                {
                    bool matches = true;
                    for (int j = 0; j < probeLength; j++)
                    {
                        if (tokenizedAnnotation[j].Value != articleCurrent[i + j].Value)
                        {
                            matches = false;
                            break;
                        }
                    }

                    if (matches)
                    {
                        sentenceNumber = articleCurrent[i].Sentence;
                        position = articleCurrent[i].Position;
                        break;
                    }
                }

                if (sentenceNumber != -1 && position != -1)
                {
                    int whoCount = listWho.Where(annotationCurrent.What.Contains).Count();
                    int whenCount = listWhen.Where(annotationCurrent.What.Contains).Count();
                    int whereCount = listWhere.Where(annotationCurrent.What.Contains).Count();

                    double weight = 0;

                    weight += whoCount * WEIGHT_PER_WHO;
                    weight += whenCount * WEIGHT_PER_WHEN;
                    weight += whereCount * WEIGHT_PER_WHERE;
                    weight += 1 - WEIGHT_PER_SENTENCE * sentenceNumber;

                    weight += titleWhoCount * WEIGHT_PER_W_IN_TITLE;
                    weight += titleWhenCount * WEIGHT_PER_W_IN_TITLE;
                    weight += titleWhereCount * WEIGHT_PER_W_IN_TITLE;

                    newCandidate.Position = position;
                    newCandidate.Sentence = sentenceNumber;
                    newCandidate.Score = weight;
                    newCandidate.NumWho = whoCount;
                    newCandidate.NumWhen = whenCount;
                    newCandidate.NumWhere = whereCount;

                    listSecondaryWhatCandidates.Add(newCandidate);
                }
            }

            if (isAnnotated)
            {
                wwt.train("what", articleCurrent, listSecondaryWhatCandidates);
            }

            // Start from an empty list for the next article.
            listSecondaryWhatCandidates = new List<Candidate>();

            /*Instances whatInstances = createWhatInstances();

            foreach (Instance instance in whatInstances)
            {
                double[] classProbability = whatClassifier.distributionForInstance(instance);
                if (classProbability[0] >= classProbability[1])
                {
                    strWhat = instance.stringValue(0);
                    break;
                }
            }*/
        }