Ejemplo n.º 1
0
        /// <summary>
        /// Runs the extraction pipeline on the source file of the currently selected tab:
        /// parses the articles, preprocesses each one to collect its tokens and
        /// who/when/where/what/why candidates, labels the annotations, and writes the results.
        /// </summary>
        /// <remarks>
        /// No precision/recall evaluation against training annotations is performed here;
        /// the identifier is created with <c>Identifier(false, null)</c>.
        /// </remarks>
        /// <param name="destinationPath">Forwarded to <c>ResultWriter</c> as its first path argument.</param>
        /// <param name="invertedDestinationPath">Forwarded to <c>ResultWriter</c> as its second path argument.</param>
        /// <param name="formatDateDestinationPath">Forwarded to <c>ResultWriter</c> as its third path argument.</param>
        /// <returns>
        /// <c>false</c> (after showing a message box) when the parser returns null or no articles;
        /// <c>true</c> once all three outputs have been generated.
        /// </returns>
        private Boolean extract(String destinationPath, String invertedDestinationPath, String formatDateDestinationPath)
        {
            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePaths[tabControl1.SelectedIndex]);

            // Guard clause: nothing was parsed, so tell the user and stop before doing any work.
            if (listCurrentArticles == null || listCurrentArticles.Count == 0)
            {
                MessageBox.Show("Invalid XML File!");
                return false;
            }

            // Per-article results of the preprocessing pass; index i belongs to listCurrentArticles[i].
            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

            // Pass 1: preprocess every article (all of them, in file order) before any labelling starts.
            Preprocessor preprocessor = new Preprocessor();
            foreach (Article article in listCurrentArticles)
            {
                preprocessor.setCurrentArticle(article);
                preprocessor.preprocess();

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Per-article results of the labelling pass, again index-aligned with the articles.
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            // Pass 2: feed each article's tokens and candidates to a single, reused identifier.
            Identifier annotationIdentifier = new Identifier(false, null);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                annotationIdentifier.labelAnnotations();

                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }

            // Write the three outputs in the same order as before.
            ResultWriter rw = new ResultWriter(destinationPath, invertedDestinationPath, formatDateDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();

            return true;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Extracts the who/when/where/what/why annotations for every article in the source file
        /// of the currently selected tab and writes them out through <c>ResultWriter</c>.
        /// </summary>
        /// <remarks>
        /// Works in two passes: all articles are preprocessed first, then all are labelled.
        /// The identifier is built with <c>Identifier(false, null)</c>; no training annotations
        /// are read and no precision/recall statistics are computed.
        /// </remarks>
        /// <param name="destinationPath">First path argument handed to <c>ResultWriter</c>.</param>
        /// <param name="invertedDestinationPath">Second path argument handed to <c>ResultWriter</c>.</param>
        /// <param name="formatDateDestinationPath">Third path argument handed to <c>ResultWriter</c>.</param>
        /// <returns>
        /// <c>false</c> when parsing yields null or an empty list (a message box is shown first),
        /// otherwise <c>true</c> after the three outputs were generated.
        /// </returns>
        private Boolean extract(String destinationPath, String invertedDestinationPath, String formatDateDestinationPath)
        {
            List <Article> listCurrentArticles = fileparserFP.parseFile(sourcePaths[tabControl1.SelectedIndex]);

            // Reject an unusable parse result up front instead of nesting the whole method in an if/else.
            if ((listCurrentArticles == null) || (listCurrentArticles.Count == 0))
            {
                MessageBox.Show("Invalid XML File!");
                return(false);
            }

            // Outputs of the preprocessing pass; entry i corresponds to listCurrentArticles[i].
            List <List <Token> >         listTokenizedArticles  = new List <List <Token> >();
            List <List <Candidate> >     listAllWhoCandidates   = new List <List <Candidate> >();
            List <List <Candidate> >     listAllWhenCandidates  = new List <List <Candidate> >();
            List <List <Candidate> >     listAllWhereCandidates = new List <List <Candidate> >();
            List <List <List <Token> > > listAllWhatCandidates  = new List <List <List <Token> > >();
            List <List <List <Token> > > listAllWhyCandidates   = new List <List <List <Token> > >();

            // Outputs of the labelling pass; same indexing as above.
            List <List <String> > listAllWhoAnnotations   = new List <List <String> >();
            List <List <String> > listAllWhenAnnotations  = new List <List <String> >();
            List <List <String> > listAllWhereAnnotations = new List <List <String> >();
            List <String>         listAllWhatAnnotations  = new List <String>();
            List <String>         listAllWhyAnnotations   = new List <String>();

            Preprocessor preprocessor = new Preprocessor();

            // Pass 1: every article is preprocessed; one preprocessor instance is reused throughout.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            Identifier annotationIdentifier = new Identifier(false, null);

            // Pass 2: label each article from the tokens and candidates gathered in pass 1.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }

            ResultWriter rw = new ResultWriter(destinationPath, invertedDestinationPath, formatDateDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);

            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();

            return(true);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Command-line entry point: parses the training news file (and, when
        /// <c>isAnnotated</c> is set, its annotations), preprocesses every article, then labels
        /// each article with an <c>Identifier</c> that is given a <c>WhatWhyTrainer</c>.
        /// </summary>
        /// <remarks>
        /// If no articles were parsed, or annotations are required but are missing or do not
        /// match the article count, an error is written to standard error and the method returns
        /// before the trainer is started. Previously execution fell through to the labelling loop
        /// in these cases and failed there on the null or empty lists.
        /// Several developer toggles (GUI launch, candidate dump, classifier training, result
        /// writing) are kept below as disabled code.
        /// </remarks>
        public static void Main()
        {
            /*#if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else*/
            Boolean isAnnotated = true;
            FileParser fileparserFP = new FileParser();
            String sourcePath = @"..\..\training_news.xml";
            // The three destination paths are only consumed by the disabled ResultWriter block at the end.
            String destinationPath = @"..\..\result.xml";
            String invertedDestinationPath = @"..\..\result_inverted_index.xml";
            String formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Validate the input once, up front. Every later loop indexes the per-article lists
            // by article position, so they must all have one entry per article.
            Boolean hasArticles = listCurrentArticles != null && listCurrentArticles.Count > 0;
            Boolean annotationsUsable = hasArticles &&
                (!isAnnotated || (listCurrentTrainingAnnotations != null && listCurrentTrainingAnnotations.Count > 0 &&
                listCurrentArticles.Count == listCurrentTrainingAnnotations.Count));

            if (!annotationsUsable)
            {
                System.Console.Error.WriteLine("Nothing to process: no articles were parsed from '" + sourcePath + "', or the annotations do not match the articles.");
                return;
            }

            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Pass 1: preprocess every article and collect its tokens and candidates.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                if (isAnnotated)
                {
                    // Annotation assignment runs before the candidates are read back below.
                    preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                    preprocessor.performAnnotationAssignment();
                }

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Disabled: who/when/where training on the collected candidates.
            /*if (isAnnotated)
            {
                Trainer trainer = new Trainer();
                trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
                trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
                trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);
            }*/

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/
            /*try
            {
                var whoCandidatesPath = @"..\..\candidates_who.txt";
                var whenCandidatesPath = @"..\..\candidates_when.txt";
                var whereCandidatesPath = @"..\..\candidates_where.txt";

                if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
                if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
                if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);

                using (StreamWriter sw = File.CreateText(whoCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhoCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whenCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhenCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whereCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhereCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
            }
            catch (Exception e)
            {
                System.Console.WriteLine("Error with writing initial line of training dataset.");
            }*/
            #endregion

            // Pass 2: label every article between startTrain() and endTrain() of the what/why trainer.
            WhatWhyTrainer wwt = new WhatWhyTrainer();
            wwt.startTrain();
            Identifier annotationIdentifier = new Identifier(isAnnotated, wwt);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }
            wwt.endTrain();

            // Disabled: writing the labelled results.
            // NOTE(review): confirm the order of the three path arguments against the
            // ResultWriter constructor before re-enabling.
            /*ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();*/
            //#endif
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds a scored <c>Candidate</c> for every entry of <c>listWhyCandidates</c>
        /// and, when <c>isAnnotated</c> is set, passes the resulting list to
        /// <c>wwt.train("why", ...)</c>.
        /// </summary>
        /// <remarks>
        /// Scoring per candidate: +0.5 when the candidate text contains <c>strWhat</c>,
        /// +0.5 when it contains one of the causal markers, plus any carry-over left by
        /// the previous candidate. The score is reset to 0 when <c>strWhat</c> contains
        /// the trimmed candidate text. A "START" marker keeps the text after the marker,
        /// an "END" marker keeps the text before it. Nothing happens when
        /// <c>listWhyCandidates</c> is empty.
        /// </remarks>
        private void labelWhy()
        {
            const double WEIGHT_PER_MARKER = 0.5;
            const double WEIGHT_PER_WHAT = 0.5;

            // Bonus handed to the candidate that follows one whose trimmed text equals strWhat.
            double carryOver = 0;

            // { marker text, which side of the marker holds the reason }.
            // Order matters: the first marker contained in the text wins.
            // NOTE(review): " dahilan sa" has no trailing space, unlike every other entry - confirm this is intended.
            String[][] markers = new String[][] {
                new String[] { " sanhi sa ", "START" },
                new String[] { " sanhi ng ", "START" },
                new String[] { " sapagkat ", "START" },
                new String[] { " palibhasa ay ", "START" },
                new String[] { " palibhasa ", "START" },
                new String[] { " kasi ", "START" },
                new String[] { " mangyari'y ", "START" },
                new String[] { " mangyari ay ", "START" },
                new String[] { " dahil sa ", "START" },
                new String[] { " dahil na rin sa ", "START" },
                new String[] { " dahil ", "START" },
                new String[] { " dahilan sa", "START" },
                new String[] { " dahilan ", "START" },
                new String[] { " para ", "START" },
                new String[] { " upang ", "START" },
                new String[] { " makaraang ", "START" },
                new String[] { " naglalayong ", "START" },
                new String[] { " kaya ", "END" }
            };

            if (listWhyCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Built once: both the pattern and the normalized annotation are loop-invariant.
                Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                String annotationValue = isAnnotated ? nonAlphanumeric.Replace(annotationCurrent.Why, "") : null;

                foreach (List<Token> candidate in listWhyCandidates)
                {
                    // Re-join the tokens and undo the tokenizer's spacing around punctuation.
                    String copyWhy = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    copyWhy = copyWhy.Replace("-LRB- ", "(");
                    copyWhy = copyWhy.Replace(" -RRB-", ")");
                    copyWhy = copyWhy.Replace(" . ", ".");
                    copyWhy = copyWhy.Replace(" .", ".");
                    copyWhy = copyWhy.Replace(" ,", ",");
                    copyWhy = copyWhy.Replace(" !", "!");

                    String tempWhy = copyWhy;
                    double tempWeight = 0;

                    if (copyWhy.Contains(strWhat))
                    {
                        tempWeight += WEIGHT_PER_WHAT;
                    }

                    String[] match = markers.FirstOrDefault(s => copyWhy.Contains(s[0]));

                    // Character offset inside copyWhy at which the kept text begins.
                    int startOffset = 0;

                    if (match != null)
                    {
                        // Ordinal search, so the lookup agrees with the ordinal Contains() above.
                        int markerIndex = copyWhy.IndexOf(match[0], StringComparison.Ordinal);

                        if (match[1].Equals("START"))
                        {
                            startOffset = markerIndex + match[0].Length;
                            tempWhy = copyWhy.Substring(startOffset);
                        }
                        else
                        {
                            tempWhy = copyWhy.Substring(0, markerIndex);
                        }

                        tempWeight += WEIGHT_PER_MARKER;
                    }

                    tempWeight += carryOver;
                    carryOver = 0;

                    if (strWhat.Contains(tempWhy))
                    {
                        tempWeight = 0;
                    }

                    if (strWhat.Equals(tempWhy))
                    {
                        carryOver = 0.5;
                    }

                    // Count the words that precede the kept text. The cut offset is used directly
                    // instead of searching for tempWhy again, which would pick an earlier
                    // occurrence whenever the kept text is repeated (or empty).
                    int position = candidate[0].Position + copyWhy.Substring(0, startOffset).Split(' ').Length - 1;
                    int length = tempWhy.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhy, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhy.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhy.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare on letters and digits only so spacing/punctuation differences are ignored.
                        var candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == annotationValue)
                        {
                            newCandidate.IsWhy = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhyCandidates.Add(newCandidate);
                }

                // No generated candidate equals the annotation: try to locate the annotation
                // in the article and add it as an extra candidate.
                if (isAnnotated && !foundMatching && annotationCurrent.Why.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.Why);

                    int sentenceNumber = -1;
                    int position = -1;

                    // Guard: an annotation that yields no tokens cannot be located.
                    if (tokenizedAnnotation != null && tokenizedAnnotation.Count > 0)
                    {
                        // At most the first three annotation tokens are compared. Sizing the scan
                        // by that window lets short annotations match at the very end of the article.
                        int window = Math.Min(tokenizedAnnotation.Count, 3);

                        for (int i = 0; i + window <= articleCurrent.Count; i++)
                        {
                            if (tokenizedAnnotation[0].Value == articleCurrent[i].Value &&
                               (window < 2 || tokenizedAnnotation[1].Value == articleCurrent[i + 1].Value) &&
                               (window < 3 || tokenizedAnnotation[2].Value == articleCurrent[i + 2].Value))
                            {
                                sentenceNumber = articleCurrent[i].Sentence;
                                position = articleCurrent[i].Position;
                                break;
                            }
                        }
                    }

                    if (sentenceNumber != -1 && position != -1)
                    {
                        Candidate newCandidate = new Candidate(
                            annotationCurrent.Why, 0, annotationCurrent.Why.Split(' ').Length
                        );

                        double tempWeight = 0;

                        if (annotationCurrent.Why.Contains(annotationCurrent.What))
                        {
                            tempWeight += WEIGHT_PER_WHAT;
                        }

                        String[] match = markers.FirstOrDefault(s => annotationCurrent.Why.Contains(s[0]));

                        if (match != null)
                        {
                            tempWeight += WEIGHT_PER_MARKER;
                        }

                        tempWeight += carryOver;
                        carryOver = 0;

                        if (annotationCurrent.What.Contains(annotationCurrent.Why))
                        {
                            tempWeight = 0;
                        }

                        newCandidate.Position = position;
                        newCandidate.Sentence = sentenceNumber;
                        newCandidate.Score = tempWeight;
                        newCandidate.NumWho = listWho.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhen = listWhen.Where(annotationCurrent.Why.Contains).Count();
                        newCandidate.NumWhere = listWhere.Where(annotationCurrent.Why.Contains).Count();

                        listSecondaryWhyCandidates.Add(newCandidate);
                    }
                }

                if (isAnnotated)
                {
                    wwt.train("why", articleCurrent, listSecondaryWhyCandidates);
                }

                // Start the next article with a fresh candidate list.
                listSecondaryWhyCandidates = new List<Candidate>();
            }

            /*Instances whyInstances = createWhyInstances();

            foreach (Instance instance in whyInstances)
            {
                double[] classProbability = whyClassifier.distributionForInstance(instance);
                if (classProbability[0] >= classProbability[1])
                {
                    strWhy = instance.stringValue(0);
                    break;
                }
            }*/
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds a scored <c>Candidate</c> for every entry of <c>listWhatCandidates</c>
        /// and, when <c>isAnnotated</c> is set, passes the resulting list to
        /// <c>wwt.train("what", ...)</c>.
        /// </summary>
        /// <remarks>
        /// The score of a candidate is the sum of: the number of who/when/where strings
        /// contained in its text (weighted 0.3/0.2/0.2), <c>1 - 0.2 * sentence number</c>,
        /// and 0.1 for every who/when/where string contained in <c>titleCurrent</c>.
        /// After scoring, the text is trimmed at the first matching marker: a "START"
        /// marker keeps what follows it, an "END" marker keeps what precedes it.
        /// Nothing happens when <c>listWhatCandidates</c> is empty.
        /// </remarks>
        private void labelWhat()
        {
            const double WEIGHT_PER_WHO = 0.3;
            const double WEIGHT_PER_WHEN = 0.2;
            const double WEIGHT_PER_WHERE = 0.2;
            const double WEIGHT_PER_SENTENCE = 0.2;
            const double WEIGHT_PER_W_IN_TITLE = 0.1;

            // { marker text, which side of the marker holds the event }.
            // NOTE(review): these markers carry no surrounding spaces, so they also match
            // inside longer words - confirm this is intended.
            String[][] markers = new String[][] {
                new String[] { "kaya", "START" },
                new String[] { "para", "END" },
                new String[] { "dahil", "END" },
                new String[] { "upang", "END" },
                new String[] { "makaraang", "END" },
            };

            if (listWhatCandidates.Count > 0)
            {
                bool foundMatching = false;

                // Loop-invariant values, computed once instead of once per candidate.
                int titleWhoCount = listWho.Where(titleCurrent.Contains).Count();
                int titleWhenCount = listWhen.Where(titleCurrent.Contains).Count();
                int titleWhereCount = listWhere.Where(titleCurrent.Contains).Count();
                Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
                String annotationValue = isAnnotated ? nonAlphanumeric.Replace(annotationCurrent.What, "") : null;

                foreach (List<Token> candidate in listWhatCandidates)
                {
                    // Re-join the tokens and undo the tokenizer's spacing around punctuation.
                    String copyWhat = String.Join(" ", candidate.Select(token => token.Value).ToArray());
                    copyWhat = copyWhat.Replace("-LRB- ", "(");
                    copyWhat = copyWhat.Replace(" -RRB-", ")");
                    copyWhat = copyWhat.Replace(" . ", ".");
                    copyWhat = copyWhat.Replace(" .", ".");
                    copyWhat = copyWhat.Replace(" ,", ",");
                    copyWhat = copyWhat.Replace(" !", "!");

                    String tempWhat = copyWhat;

                    // The score is based on the untrimmed text.
                    double tempWeight = 0;
                    tempWeight += listWho.Where(copyWhat.Contains).Count() * WEIGHT_PER_WHO;
                    tempWeight += listWhen.Where(copyWhat.Contains).Count() * WEIGHT_PER_WHEN;
                    tempWeight += listWhere.Where(copyWhat.Contains).Count() * WEIGHT_PER_WHERE;
                    tempWeight += 1 - WEIGHT_PER_SENTENCE * candidate[0].Sentence;

                    tempWeight += titleWhoCount * WEIGHT_PER_W_IN_TITLE;
                    tempWeight += titleWhenCount * WEIGHT_PER_W_IN_TITLE;
                    tempWeight += titleWhereCount * WEIGHT_PER_W_IN_TITLE;

                    String[] match = markers.FirstOrDefault(s => copyWhat.Contains(s[0]));

                    // Character offset inside copyWhat at which the kept text begins.
                    int startOffset = 0;

                    if (match != null)
                    {
                        // Ordinal search, so the lookup agrees with the ordinal Contains() above.
                        int markerIndex = copyWhat.IndexOf(match[0], StringComparison.Ordinal);

                        if (match[1].Equals("START"))
                        {
                            // Skip the marker and the space after it. The offset is clamped because
                            // a marker that ends the text would otherwise push it past the end of
                            // the string and make Substring throw.
                            startOffset = Math.Min(markerIndex + match[0].Length + 1, copyWhat.Length);
                            tempWhat = copyWhat.Substring(startOffset);
                        }
                        else
                        {
                            tempWhat = copyWhat.Substring(0, markerIndex);
                        }
                    }

                    // Count the words that precede the kept text. The cut offset is used directly
                    // instead of searching for tempWhat again, which would pick an earlier
                    // occurrence whenever the kept text is repeated (or empty).
                    int position = candidate[0].Position + copyWhat.Substring(0, startOffset).Split(' ').Length - 1;
                    int length = tempWhat.Split(' ').Length;

                    Candidate newCandidate = new Candidate(tempWhat, position, length);

                    newCandidate.Sentence = candidate[0].Sentence;
                    newCandidate.Score = tempWeight;
                    newCandidate.NumWho = listWho.Where(tempWhat.Contains).Count();
                    newCandidate.NumWhen = listWhen.Where(tempWhat.Contains).Count();
                    newCandidate.NumWhere = listWhere.Where(tempWhat.Contains).Count();

                    if (isAnnotated)
                    {
                        // Compare on letters and digits only so spacing/punctuation differences are ignored.
                        var candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                        if (candidateValue == annotationValue)
                        {
                            newCandidate.IsWhat = true;
                            foundMatching = true;
                        }
                    }

                    listSecondaryWhatCandidates.Add(newCandidate);
                }

                // No generated candidate equals the annotation: try to locate the annotation
                // in the article and add it as an extra candidate.
                if (isAnnotated && !foundMatching && annotationCurrent.What.Length > 0)
                {
                    Preprocessor p = new Preprocessor();
                    List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.What);

                    int sentenceNumber = -1;
                    int position = -1;

                    // Guard: an annotation that yields no tokens cannot be located.
                    if (tokenizedAnnotation != null && tokenizedAnnotation.Count > 0)
                    {
                        // At most the first three annotation tokens are compared. Sizing the scan
                        // by that window lets short annotations match at the very end of the article.
                        int window = Math.Min(tokenizedAnnotation.Count, 3);

                        for (int i = 0; i + window <= articleCurrent.Count; i++)
                        {
                            if (tokenizedAnnotation[0].Value == articleCurrent[i].Value &&
                               (window < 2 || tokenizedAnnotation[1].Value == articleCurrent[i + 1].Value) &&
                               (window < 3 || tokenizedAnnotation[2].Value == articleCurrent[i + 2].Value))
                            {
                                sentenceNumber = articleCurrent[i].Sentence;
                                position = articleCurrent[i].Position;
                                break;
                            }
                        }
                    }

                    if (sentenceNumber != -1 && position != -1)
                    {
                        Candidate newCandidate = new Candidate(
                            annotationCurrent.What, 0, annotationCurrent.What.Split(' ').Length
                        );

                        int annotationWhoCount = listWho.Where(annotationCurrent.What.Contains).Count();
                        int annotationWhenCount = listWhen.Where(annotationCurrent.What.Contains).Count();
                        int annotationWhereCount = listWhere.Where(annotationCurrent.What.Contains).Count();

                        double tempWeight = 0;

                        tempWeight += annotationWhoCount * WEIGHT_PER_WHO;
                        tempWeight += annotationWhenCount * WEIGHT_PER_WHEN;
                        tempWeight += annotationWhereCount * WEIGHT_PER_WHERE;
                        tempWeight += 1 - WEIGHT_PER_SENTENCE * sentenceNumber;

                        tempWeight += titleWhoCount * WEIGHT_PER_W_IN_TITLE;
                        tempWeight += titleWhenCount * WEIGHT_PER_W_IN_TITLE;
                        tempWeight += titleWhereCount * WEIGHT_PER_W_IN_TITLE;

                        newCandidate.Position = position;
                        newCandidate.Sentence = sentenceNumber;
                        newCandidate.Score = tempWeight;
                        newCandidate.NumWho = annotationWhoCount;
                        newCandidate.NumWhen = annotationWhenCount;
                        newCandidate.NumWhere = annotationWhereCount;

                        listSecondaryWhatCandidates.Add(newCandidate);
                    }
                }

                if (isAnnotated)
                {
                    wwt.train("what", articleCurrent, listSecondaryWhatCandidates);
                }

                // Start the next article with a fresh candidate list.
                listSecondaryWhatCandidates = new List<Candidate>();

                /*Instances whatInstances = createWhatInstances();

                foreach (Instance instance in whatInstances)
                {
                    double[] classProbability = whatClassifier.distributionForInstance(instance);
                    if (classProbability[0] >= classProbability[1])
                    {
                        strWhat = instance.stringValue(0);
                        break;
                    }
                }*/
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Program entry point. In DEBUG builds it starts the WinForms UI; otherwise it
        /// runs the batch pipeline: parse the training file, preprocess each article,
        /// label the articles through <c>Identifier</c> and write the three result files.
        /// </summary>
        /// <remarks>
        /// The batch run stops with a message on standard error when the articles cannot
        /// be parsed, or when annotations are required but do not line up one-to-one with
        /// a non-empty article list. Previously those inputs skipped preprocessing and
        /// then failed inside the labelling loop.
        /// </remarks>
        public static void Main()
        {
            #if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else
            Boolean    isAnnotated               = true;
            FileParser fileparserFP              = new FileParser();
            String     sourcePath                = @"..\..\training_news.xml";
            String     destinationPath           = @"..\..\result.xml";
            String     invertedDestinationPath   = @"..\..\result_inverted_index.xml";
            String     formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article>    listCurrentArticles            = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Every later step indexes the per-article lists by article number, so refuse
            // inputs for which those lists could not be filled consistently.
            if (listCurrentArticles == null)
            {
                System.Console.Error.WriteLine("No articles could be parsed from " + sourcePath + ".");
                return;
            }

            if (listCurrentArticles.Count > 0 && isAnnotated &&
                (listCurrentTrainingAnnotations == null ||
                 listCurrentTrainingAnnotations.Count != listCurrentArticles.Count))
            {
                System.Console.Error.WriteLine("The annotations in " + sourcePath + " do not match the number of articles.");
                return;
            }

            List<List<Token>>       listTokenizedArticles   = new List<List<Token>>();
            List<List<Candidate>>   listAllWhoCandidates    = new List<List<Candidate>>();
            List<List<Candidate>>   listAllWhenCandidates   = new List<List<Candidate>>();
            List<List<Candidate>>   listAllWhereCandidates  = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates   = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates    = new List<List<List<Token>>>();
            List<List<String>>      listAllWhoAnnotations   = new List<List<String>>();
            List<List<String>>      listAllWhenAnnotations  = new List<List<String>>();
            List<List<String>>      listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations  = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Preprocess every article and collect its tokens and candidate lists.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                if (isAnnotated)
                {
                    preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                    preprocessor.performAnnotationAssignment();
                }

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Disabled who/when/where training step, kept for reference:
            /*Trainer trainer = new Trainer();
             * trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
             * trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
             * trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/

            /*try
             * {
             *  var whoCandidatesPath = @"..\..\candidates_who.txt";
             *  var whenCandidatesPath = @"..\..\candidates_when.txt";
             *  var whereCandidatesPath = @"..\..\candidates_where.txt";
             *
             *  if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
             *  if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
             *  if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);
             *
             *  using (StreamWriter sw = File.CreateText(whoCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhoCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             *  using (StreamWriter sw = File.CreateText(whenCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhenCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             *  using (StreamWriter sw = File.CreateText(whereCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhereCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             * }
             * catch (Exception e)
             * {
             *  System.Console.WriteLine("Error with writing initial line of training dataset.");
             * }*/
            #endregion

            // The trainer is only opened and closed around the labelling pass when annotations exist.
            WhyTrainer wt = new WhyTrainer();
            if (isAnnotated)
            {
                wt.startTrain();
            }

            Identifier annotationIdentifier = new Identifier(isAnnotated, wt);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }

                annotationIdentifier.labelAnnotations();

                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }

            if (isAnnotated)
            {
                wt.endTrain();
            }

            ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();
            #endif
        }