Ejemplo n.º 1
0
        /// <summary>
        /// Scores every raw "why" candidate in <c>listWhyCandidates</c>, collects the
        /// results in <c>listSecondaryWhyCandidates</c> and, for annotated articles,
        /// passes them to <c>wwt.train</c>.
        /// </summary>
        /// <remarks>
        /// A candidate earns <c>WeightPerWhat</c> when its text contains <c>strWhat</c> and
        /// <c>WeightPerMarker</c> when it contains one of the causal markers; the text is
        /// then cut down to the part after a "START" marker or before an "END" marker.
        /// A candidate whose trimmed text is contained in <c>strWhat</c> scores 0, and one
        /// whose trimmed text equals <c>strWhat</c> passes a bonus on to the next candidate.
        /// When the article is annotated and no candidate matches the annotated "why", the
        /// annotation itself is located in the article and added as an extra candidate.
        /// Nothing happens when <c>listWhyCandidates</c> is empty; otherwise
        /// <c>listSecondaryWhyCandidates</c> is replaced by a new empty list at the end.
        /// </remarks>
        private void labelWhy()
        {
            const double WeightPerMarker = 0.5;
            const double WeightPerWhat = 0.5;
            const double CarryOverBonus = 0.5;

            // Bonus handed from a candidate that equals strWhat to the next scored candidate.
            double carryOver = 0;

            // { marker text, which side of the marker holds the reason }.
            // The first marker (in this order) found in a candidate is the one applied.
            String[][] markers = new String[][] {
                new String[] { " sanhi sa ", "START" },
                new String[] { " sanhi ng ", "START" },
                new String[] { " sapagkat ", "START" },
                new String[] { " palibhasa ay ", "START" },
                new String[] { " palibhasa ", "START" },
                new String[] { " kasi ", "START" },
                new String[] { " mangyari'y ", "START" },
                new String[] { " mangyari ay ", "START" },
                new String[] { " dahil sa ", "START" },
                new String[] { " dahil na rin sa ", "START" },
                new String[] { " dahil ", "START" },
                // NOTE(review): this is the only marker without a trailing space - confirm intended.
                new String[] { " dahilan sa", "START" },
                new String[] { " dahilan ", "START" },
                new String[] { " para ", "START" },
                new String[] { " upang ", "START" },
                new String[] { " makaraang ", "START" },
                new String[] { " naglalayong ", "START" },
                new String[] { " kaya ", "END" }
            };

            if (listWhyCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Built once instead of once per candidate: the pattern and the
                // annotation do not change while the candidates are processed.
                Regex nonAlphanumeric = null;
                String annotationValue = null;
                if (isAnnotated)
                {
                    nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                    annotationValue = nonAlphanumeric.Replace(annotationCurrent.Why, "");
                }

                foreach (List<Token> candidate in listWhyCandidates)
                {
                    // An empty token list has neither a position nor a sentence to report.
                    if (candidate == null || candidate.Count == 0)
                    {
                        continue;
                    }

                    // Re-join the tokens and undo the tokenizer's bracket/punctuation spacing.
                    String copyWhy = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    copyWhy = copyWhy.Replace("-LRB- ", "(");
                    copyWhy = copyWhy.Replace(" -RRB-", ")");
                    copyWhy = copyWhy.Replace(" . ", ".");
                    copyWhy = copyWhy.Replace(" .", ".");
                    copyWhy = copyWhy.Replace(" ,", ",");
                    copyWhy = copyWhy.Replace(" !", "!");

                    String tempWhy = copyWhy;
                    double tempWeight = 0;

                    // Character offset in copyWhy at which the trimmed text begins.
                    int trimmedStart = 0;

                    if (copyWhy.Contains(strWhat))
                    {
                        tempWeight += WeightPerWhat;
                    }

                    String[] match = markers.FirstOrDefault(s => copyWhy.Contains(s[0]));

                    if (match != null)
                    {
                        // Ordinal search agrees with the ordinal Contains above, so the index is never -1.
                        int markerIndex = copyWhy.IndexOf(match[0], StringComparison.Ordinal);

                        if (match[1].Equals("START"))
                        {
                            trimmedStart = markerIndex + match[0].Length;
                            tempWhy = copyWhy.Substring(trimmedStart);
                        }
                        else
                        {
                            tempWhy = copyWhy.Substring(0, markerIndex);
                        }

                        tempWeight += WeightPerMarker;
                    }

                    tempWeight += carryOver;
                    carryOver = 0;

                    // A "why" that is merely part of the "what" is worthless.
                    if (strWhat.Contains(tempWhy))
                    {
                        tempWeight = 0;
                    }

                    if (strWhat.Equals(tempWhy))
                    {
                        carryOver = CarryOverBonus;
                    }

                    // Count the space-separated words in front of the trimmed text. The offset
                    // recorded while trimming is used rather than searching for the trimmed
                    // text again, which would hit an earlier duplicate of the same words.
                    int wordsBefore = copyWhy.Substring(0, trimmedStart).Split(' ').Length - 1;
                    int position = candidate[0].Position + wordsBefore;
                    int length = tempWhy.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhy, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhy.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare with everything except ASCII letters and digits removed.
                        String candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == annotationValue)
                        {
                            newCandidate.IsWhy = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhyCandidates.Add(newCandidate);
                }

                if (isAnnotated && !foundMatching && annotationCurrent.Why.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.Why);

                    Candidate newCandidate = new Candidate(
                        annotationCurrent.Why, 0, annotationCurrent.Why.Split(' ').Length
                    );

                    int sentenceNumber = -1;
                    int position = -1;

                    // The annotation is located by its first (up to three) tokens. The window
                    // shrinks for shorter annotations so they can also be found on the last
                    // tokens of the article, and an empty tokenization simply finds nothing.
                    int window = (tokenizedAnnotation == null) ? 0 : Math.Min(tokenizedAnnotation.Count, 3);

                    for (int i = 0; window > 0 && i + window <= articleCurrent.Count; i++)
                    {
                        bool sameStart = Enumerable.Range(0, window)
                            .All(k => tokenizedAnnotation[k].Value == articleCurrent[i + k].Value);

                        if (sameStart)
                        {
                            sentenceNumber = articleCurrent[i].Sentence;
                            position = articleCurrent[i].Position;
                            break;
                        }
                    }

                    if (sentenceNumber != -1 && position != -1)
                    {
                        double tempWeight = 0;

                        if (annotationCurrent.Why.Contains(annotationCurrent.What))
                        {
                            tempWeight += WeightPerWhat;
                        }

                        if (markers.Any(s => annotationCurrent.Why.Contains(s[0])))
                        {
                            tempWeight += WeightPerMarker;
                        }

                        tempWeight += carryOver;
                        carryOver = 0;

                        if (annotationCurrent.What.Contains(annotationCurrent.Why))
                        {
                            tempWeight = 0;
                        }

                        // NOTE(review): unlike the candidates above, IsWhy is not set here - confirm intended.
                        newCandidate.Position = position;
                        newCandidate.Sentence = sentenceNumber;
                        newCandidate.Score = tempWeight;
                        newCandidate.NumWho = listWho.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhen = listWhen.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhere = listWhere.Where(annotationCurrent.Why.Contains).Count();

                        listSecondaryWhyCandidates.Add(newCandidate);
                    }
                }

                if (isAnnotated)
                {
                    wwt.train("why", articleCurrent, listSecondaryWhyCandidates);
                }

                listSecondaryWhyCandidates = new List<Candidate>();
            }

            // Classifier-based selection of strWhy (createWhyInstances / whyClassifier)
            // is not performed by this method.
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Scores every raw "why" candidate in <c>listWhyCandidates</c>, trains on them when
        /// the article is annotated, and then sets <c>strWhy</c> from the first instance the
        /// classifier accepts.
        /// </summary>
        /// <remarks>
        /// A candidate earns <c>WeightPerWhat</c> when its text contains <c>strWhat</c> and
        /// <c>WeightPerMarker</c> when it contains one of the causal markers; the text is
        /// then cut down to the part after a "START" marker or before an "END" marker.
        /// A candidate whose trimmed text is contained in <c>strWhat</c> scores 0, and one
        /// whose trimmed text equals <c>strWhat</c> passes a bonus on to the next candidate.
        /// When the article is annotated and no candidate matches the annotated "why", the
        /// annotation itself is located in the article and added as an extra candidate.
        /// The classification step runs even when <c>listWhyCandidates</c> is empty, and
        /// <c>listSecondaryWhyCandidates</c> is always replaced by a new empty list at the end.
        /// </remarks>
        private void labelWhy()
        {
            const double WeightPerMarker = 0.5;
            const double WeightPerWhat = 0.5;
            const double CarryOverBonus = 0.5;

            // Bonus handed from a candidate that equals strWhat to the next scored candidate.
            double carryOver = 0;

            // { marker text, which side of the marker holds the reason }.
            // The first marker (in this order) found in a candidate is the one applied.
            String[][] markers = new String[][] {
                new String[] { " sanhi sa ", "START" },
                new String[] { " sanhi ng ", "START" },
                new String[] { " sapagkat ", "START" },
                new String[] { " palibhasa ay ", "START" },
                new String[] { " palibhasa ", "START" },
                new String[] { " kasi ", "START" },
                new String[] { " mangyari'y ", "START" },
                new String[] { " mangyari ay ", "START" },
                new String[] { " dahil sa ", "START" },
                new String[] { " dahil na rin sa ", "START" },
                new String[] { " dahil ", "START" },
                // NOTE(review): this is the only marker without a trailing space - confirm intended.
                new String[] { " dahilan sa", "START" },
                new String[] { " dahilan ", "START" },
                new String[] { " para ", "START" },
                new String[] { " upang ", "START" },
                new String[] { " makaraang ", "START" },
                new String[] { " naglalayong ", "START" },
                new String[] { " kaya ", "END" }
            };

            if (listWhyCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Built once instead of once per candidate: the pattern and the
                // annotation do not change while the candidates are processed.
                Regex nonAlphanumeric = null;
                String annotationValue = null;
                if (isAnnotated)
                {
                    nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                    annotationValue = nonAlphanumeric.Replace(annotationCurrent.Why, "");
                }

                foreach (List<Token> candidate in listWhyCandidates)
                {
                    // An empty token list has neither a position nor a sentence to report.
                    if (candidate == null || candidate.Count == 0)
                    {
                        continue;
                    }

                    // Re-join the tokens and undo the tokenizer's bracket/punctuation spacing.
                    String copyWhy = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    copyWhy = copyWhy.Replace("-LRB- ", "(");
                    copyWhy = copyWhy.Replace(" -RRB-", ")");
                    copyWhy = copyWhy.Replace(" . ", ".");
                    copyWhy = copyWhy.Replace(" .", ".");
                    copyWhy = copyWhy.Replace(" ,", ",");
                    copyWhy = copyWhy.Replace(" !", "!");

                    String tempWhy = copyWhy;
                    double tempWeight = 0;

                    // Character offset in copyWhy at which the trimmed text begins.
                    int trimmedStart = 0;

                    if (copyWhy.Contains(strWhat))
                    {
                        tempWeight += WeightPerWhat;
                    }

                    String[] match = markers.FirstOrDefault(s => copyWhy.Contains(s[0]));

                    if (match != null)
                    {
                        // Ordinal search agrees with the ordinal Contains above, so the index is never -1.
                        int markerIndex = copyWhy.IndexOf(match[0], StringComparison.Ordinal);

                        if (match[1].Equals("START"))
                        {
                            trimmedStart = markerIndex + match[0].Length;
                            tempWhy = copyWhy.Substring(trimmedStart);
                        }
                        else
                        {
                            tempWhy = copyWhy.Substring(0, markerIndex);
                        }

                        tempWeight += WeightPerMarker;
                    }

                    tempWeight += carryOver;
                    carryOver = 0;

                    // A "why" that is merely part of the "what" is worthless.
                    if (strWhat.Contains(tempWhy))
                    {
                        tempWeight = 0;
                    }

                    if (strWhat.Equals(tempWhy))
                    {
                        carryOver = CarryOverBonus;
                    }

                    // Count the space-separated words in front of the trimmed text. The offset
                    // recorded while trimming is used rather than searching for the trimmed
                    // text again, which would hit an earlier duplicate of the same words.
                    int wordsBefore = copyWhy.Substring(0, trimmedStart).Split(' ').Length - 1;
                    int position = candidate[0].Position + wordsBefore;
                    int length = tempWhy.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhy, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhy.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare with everything except ASCII letters and digits removed.
                        String candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == annotationValue)
                        {
                            newCandidate.IsWhy = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhyCandidates.Add(newCandidate);
                }

                if (isAnnotated && !foundMatching && annotationCurrent.Why.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.Why);

                    Candidate newCandidate = new Candidate(
                        annotationCurrent.Why, 0, annotationCurrent.Why.Split(' ').Length
                    );

                    int sentenceNumber = -1;
                    int position = -1;

                    // The annotation is located by its first (up to three) tokens. The window
                    // shrinks for shorter annotations so they can also be found on the last
                    // tokens of the article, and an empty tokenization simply finds nothing.
                    int window = (tokenizedAnnotation == null) ? 0 : Math.Min(tokenizedAnnotation.Count, 3);

                    for (int i = 0; window > 0 && i + window <= articleCurrent.Count; i++)
                    {
                        bool sameStart = Enumerable.Range(0, window)
                            .All(k => tokenizedAnnotation[k].Value == articleCurrent[i + k].Value);

                        if (sameStart)
                        {
                            sentenceNumber = articleCurrent[i].Sentence;
                            position = articleCurrent[i].Position;
                            break;
                        }
                    }

                    if (sentenceNumber != -1 && position != -1)
                    {
                        double tempWeight = 0;

                        if (annotationCurrent.Why.Contains(annotationCurrent.What))
                        {
                            tempWeight += WeightPerWhat;
                        }

                        if (markers.Any(s => annotationCurrent.Why.Contains(s[0])))
                        {
                            tempWeight += WeightPerMarker;
                        }

                        tempWeight += carryOver;
                        carryOver = 0;

                        if (annotationCurrent.What.Contains(annotationCurrent.Why))
                        {
                            tempWeight = 0;
                        }

                        // NOTE(review): unlike the candidates above, IsWhy is not set here - confirm intended.
                        newCandidate.Position = position;
                        newCandidate.Sentence = sentenceNumber;
                        newCandidate.Score = tempWeight;
                        newCandidate.NumWho = listWho.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhen = listWhen.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhere = listWhere.Where(annotationCurrent.Why.Contains).Count();

                        listSecondaryWhyCandidates.Add(newCandidate);
                    }
                }

                if (isAnnotated)
                {
                    wt.train("why", articleCurrent, listSecondaryWhyCandidates);
                }
            }

            Instances whyInstances = createWhyInstances();

            // Take the first instance for which class 0 is at least as probable as class 1.
            foreach (Instance instance in whyInstances)
            {
                double[] classProbability = whyClassifier.distributionForInstance(instance);
                if (classProbability[0] >= classProbability[1])
                {
                    strWhy = instance.stringValue(0);
                    break;
                }
            }

            listSecondaryWhyCandidates = new List<Candidate>();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Scores every raw "what" candidate in <c>listWhatCandidates</c>, collects the
        /// results in <c>listSecondaryWhatCandidates</c> and, for annotated articles,
        /// passes them to <c>wwt.train</c>.
        /// </summary>
        /// <remarks>
        /// The score is built from how many entries of <c>listWho</c>, <c>listWhen</c> and
        /// <c>listWhere</c> occur in the candidate text, from the candidate's sentence
        /// number (lower numbers score higher) and from how many of those entries occur in
        /// <c>titleCurrent</c>. After scoring, the text is cut down to the part after a
        /// "START" marker or before an "END" marker.
        /// When the article is annotated and no candidate matches the annotated "what", the
        /// annotation itself is located in the article and added as an extra candidate.
        /// Nothing happens when <c>listWhatCandidates</c> is empty; otherwise
        /// <c>listSecondaryWhatCandidates</c> is replaced by a new empty list at the end.
        /// </remarks>
        private void labelWhat()
        {
            const double WeightPerWho = 0.3;
            const double WeightPerWhen = 0.2;
            const double WeightPerWhere = 0.2;
            const double WeightPerSentence = 0.2;
            const double WeightPerWInTitle = 0.1;

            // { marker text, which side of the marker holds the "what" }.
            // NOTE(review): these markers carry no surrounding spaces, so Contains also
            // matches them inside longer words (e.g. "dahil" within "dahilan") - confirm intended.
            String[][] markers = new String[][] {
                new String[] { "kaya", "START" },
                new String[] { "para", "END" },
                new String[] { "dahil", "END" },
                new String[] { "upang", "END" },
                new String[] { "makaraang", "END" },
            };

            if (listWhatCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Title mentions do not depend on the candidate, so count them once.
                int whoInTitle = listWho.Where(titleCurrent.Contains).Count();
                int whenInTitle = listWhen.Where(titleCurrent.Contains).Count();
                int whereInTitle = listWhere.Where(titleCurrent.Contains).Count();

                // Built once instead of once per candidate: the pattern and the
                // annotation do not change while the candidates are processed.
                Regex nonAlphanumeric = null;
                String annotationValue = null;
                if (isAnnotated)
                {
                    nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                    annotationValue = nonAlphanumeric.Replace(annotationCurrent.What, "");
                }

                foreach (List<Token> candidate in listWhatCandidates)
                {
                    // An empty token list has neither a position nor a sentence to report.
                    if (candidate == null || candidate.Count == 0)
                    {
                        continue;
                    }

                    // Re-join the tokens and undo the tokenizer's bracket/punctuation spacing.
                    String copyWhat = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    copyWhat = copyWhat.Replace("-LRB- ", "(");
                    copyWhat = copyWhat.Replace(" -RRB-", ")");
                    copyWhat = copyWhat.Replace(" . ", ".");
                    copyWhat = copyWhat.Replace(" .", ".");
                    copyWhat = copyWhat.Replace(" ,", ",");
                    copyWhat = copyWhat.Replace(" !", "!");

                    // The score is computed on the untrimmed text.
                    double tempWeight = 0;
                    tempWeight += listWho.Where(copyWhat.Contains).Count() * WeightPerWho;
                    tempWeight += listWhen.Where(copyWhat.Contains).Count() * WeightPerWhen;
                    tempWeight += listWhere.Where(copyWhat.Contains).Count() * WeightPerWhere;
                    tempWeight += 1 - WeightPerSentence * candidate[0].Sentence;

                    tempWeight += whoInTitle * WeightPerWInTitle;
                    tempWeight += whenInTitle * WeightPerWInTitle;
                    tempWeight += whereInTitle * WeightPerWInTitle;

                    String tempWhat = copyWhat;

                    // Character offset in copyWhat at which the trimmed text begins.
                    int trimmedStart = 0;

                    String[] match = markers.FirstOrDefault(s => copyWhat.Contains(s[0]));

                    if (match != null)
                    {
                        // Ordinal search agrees with the ordinal Contains above, so the index is never -1.
                        int markerIndex = copyWhat.IndexOf(match[0], StringComparison.Ordinal);

                        if (match[1].Equals("START"))
                        {
                            // Skip the marker plus the character that follows it; clamp so a
                            // marker at the very end of the text yields "" instead of throwing.
                            trimmedStart = Math.Min(markerIndex + match[0].Length + 1, copyWhat.Length);
                            tempWhat = copyWhat.Substring(trimmedStart);
                        }
                        else
                        {
                            tempWhat = copyWhat.Substring(0, markerIndex);
                        }
                    }

                    // Count the space-separated words in front of the trimmed text. The offset
                    // recorded while trimming is used rather than searching for the trimmed
                    // text again, which would hit an earlier duplicate of the same words.
                    int wordsBefore = copyWhat.Substring(0, trimmedStart).Split(' ').Length - 1;
                    int position = candidate[0].Position + wordsBefore;
                    int length = tempWhat.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhat, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhat.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhat.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhat.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare with everything except ASCII letters and digits removed.
                        String candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == annotationValue)
                        {
                            newCandidate.IsWhat = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhatCandidates.Add(newCandidate);
                }

                if (isAnnotated && !foundMatching && annotationCurrent.What.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.What);

                    Candidate newCandidate = new Candidate(
                        annotationCurrent.What, 0, annotationCurrent.What.Split(' ').Length
                    );

                    int sentenceNumber = -1;
                    int position = -1;

                    // The annotation is located by its first (up to three) tokens. The window
                    // shrinks for shorter annotations so they can also be found on the last
                    // tokens of the article, and an empty tokenization simply finds nothing.
                    int window = (tokenizedAnnotation == null) ? 0 : Math.Min(tokenizedAnnotation.Count, 3);

                    for (int i = 0; window > 0 && i + window <= articleCurrent.Count; i++)
                    {
                        bool sameStart = Enumerable.Range(0, window)
                            .All(k => tokenizedAnnotation[k].Value == articleCurrent[i + k].Value);

                        if (sameStart)
                        {
                            sentenceNumber = articleCurrent[i].Sentence;
                            position = articleCurrent[i].Position;
                            break;
                        }
                    }

                    if (sentenceNumber != -1 && position != -1)
                    {
                        int whoInAnnotation = listWho.Where(annotationCurrent.What.Contains).Count();
                        int whenInAnnotation = listWhen.Where(annotationCurrent.What.Contains).Count();
                        int whereInAnnotation = listWhere.Where(annotationCurrent.What.Contains).Count();

                        double tempWeight = 0;

                        tempWeight += whoInAnnotation * WeightPerWho;
                        tempWeight += whenInAnnotation * WeightPerWhen;
                        tempWeight += whereInAnnotation * WeightPerWhere;
                        tempWeight += 1 - WeightPerSentence * sentenceNumber;

                        tempWeight += whoInTitle * WeightPerWInTitle;
                        tempWeight += whenInTitle * WeightPerWInTitle;
                        tempWeight += whereInTitle * WeightPerWInTitle;

                        // NOTE(review): unlike the candidates above, IsWhat is not set here - confirm intended.
                        newCandidate.Position = position;
                        newCandidate.Sentence = sentenceNumber;
                        newCandidate.Score = tempWeight;
                        newCandidate.NumWho = whoInAnnotation;
                        newCandidate.NumWhen = whenInAnnotation;
                        newCandidate.NumWhere = whereInAnnotation;

                        listSecondaryWhatCandidates.Add(newCandidate);
                    }
                }

                if (isAnnotated)
                {
                    wwt.train("what", articleCurrent, listSecondaryWhatCandidates);
                }

                listSecondaryWhatCandidates = new List<Candidate>();

                // Classifier-based selection of strWhat (createWhatInstances / whatClassifier)
                // is not performed by this method.
            }
        }