Exemplo n.º 1
0
        /// <summary>
        /// Parses the source file of the currently selected tab, preprocesses every article,
        /// labels the who/when/where/what/why annotations and writes the results through a
        /// <c>ResultWriter</c>.
        /// </summary>
        /// <param name="destinationPath">First path argument handed to the <c>ResultWriter</c> constructor.</param>
        /// <param name="invertedDestinationPath">Second path argument handed to the <c>ResultWriter</c> constructor.</param>
        /// <param name="formatDateDestinationPath">Third path argument handed to the <c>ResultWriter</c> constructor.</param>
        /// <returns>
        /// <c>false</c> (after showing a message box) when the parser returns <c>null</c> or no articles;
        /// <c>true</c> once all three outputs have been generated.
        /// </returns>
        private Boolean extract(String destinationPath, String invertedDestinationPath, String formatDateDestinationPath)
        {
            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePaths[tabControl1.SelectedIndex]);

            // Guard clause: without articles there is nothing to preprocess or write.
            if (listCurrentArticles == null || listCurrentArticles.Count == 0)
            {
                MessageBox.Show("Invalid XML File!");
                return false;
            }

            // Per-article preprocessing results; index nI in each list belongs to listCurrentArticles[nI].
            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

            // Per-article labelled annotations handed to the ResultWriter.
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            // Phase 1: preprocess every article and collect its tokens and candidates.
            // No precision/recall evaluation against training annotations happens here; the
            // accumulators and the disabled code that used them were removed because nothing read them.
            Preprocessor preprocessor = new Preprocessor();
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Phase 2: label the annotations of each article from its candidates.
            // The identifier is created with (false, null), i.e. without annotations or a trainer.
            Identifier annotationIdentifier = new Identifier(false, null);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                annotationIdentifier.labelAnnotations();

                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }

            // Phase 3: write the three output files.
            // NOTE(review): confirm the order of the three path arguments against the
            // ResultWriter constructor's parameter list.
            ResultWriter rw = new ResultWriter(destinationPath, invertedDestinationPath, formatDateDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();

            return true;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Entry point of the training harness: parses the training news file, preprocesses
        /// every article (assigning its training annotation when <c>isAnnotated</c> is set),
        /// then runs the identifier over each preprocessed article between
        /// <c>WhatWhyTrainer.startTrain()</c> and <c>WhatWhyTrainer.endTrain()</c>.
        /// </summary>
        /// <remarks>
        /// When the parsed input is unusable (no articles, or annotations missing or not matching
        /// the articles one-to-one) a diagnostic is written to standard error and no article is
        /// identified, instead of failing with a null-reference or index-out-of-range exception.
        /// </remarks>
        public static void Main()
        {
            /*#if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else*/
            Boolean isAnnotated = true;
            FileParser fileparserFP = new FileParser();
            String sourcePath = @"..\..\training_news.xml";
            // The three destination paths are only consumed by the disabled ResultWriter block at the end.
            String destinationPath = @"..\..\result.xml";
            String invertedDestinationPath = @"..\..\result_inverted_index.xml";
            String formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                 listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Per-article preprocessing results; index nI in each list belongs to listCurrentArticles[nI].
            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Input is usable when there is at least one article and, in annotated mode,
            // exactly one training annotation per article.
            bool inputIsValid = listCurrentArticles != null && listCurrentArticles.Count > 0 &&
                (!isAnnotated || (listCurrentTrainingAnnotations != null && listCurrentTrainingAnnotations.Count > 0 &&
                listCurrentArticles.Count == listCurrentTrainingAnnotations.Count));

            if (inputIsValid)
            {
                // Preprocess every parsed article.
                for (int nI = 0; nI < listCurrentArticles.Count; nI++)
                {
                    preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                    preprocessor.preprocess();

                    if (isAnnotated)
                    {
                        preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                        preprocessor.performAnnotationAssignment();
                    }

                    listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                    listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                    listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                    listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                    listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                    listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
                }

                // Disabled who/when/where training step (annotated mode only):
                /*Trainer trainer = new Trainer();
                trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
                trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
                trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/
            }
            else
            {
                System.Console.Error.WriteLine("No articles to process, or the articles and training annotations do not match; no article will be identified.");
            }

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/
            /*try
            {
                var whoCandidatesPath = @"..\..\candidates_who.txt";
                var whenCandidatesPath = @"..\..\candidates_when.txt";
                var whereCandidatesPath = @"..\..\candidates_where.txt";

                if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
                if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
                if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);

                using (StreamWriter sw = File.CreateText(whoCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhoCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whenCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhenCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whereCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhereCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
            }
            catch (Exception e)
            {
                System.Console.WriteLine("Error with writing initial line of training dataset.");
            }*/
            #endregion

            WhatWhyTrainer wwt = new WhatWhyTrainer();
            wwt.startTrain();
            Identifier annotationIdentifier = new Identifier(isAnnotated, wwt);

            // Iterate over the articles that were actually preprocessed. The list is empty when
            // the input was rejected above, so the per-article lists are never indexed out of
            // range and a null article list is never dereferenced.
            for (int nI = 0; nI < listTokenizedArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }
            wwt.endTrain();

            /*ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();*/
            //#endif
        }