Beispiel #1
0
        /// <summary>
        /// Builds an identifier: stores the annotation flag and the what/why trainer, creates
        /// empty candidate lists, fills the part-of-speech vector from
        /// <c>Token.PartOfSpeechTags</c>, reads the who/when/where classifier models from the
        /// relative <c>IdentifierModels</c> paths and finally calls <c>initializeAnnotations()</c>.
        /// </summary>
        /// <param name="isAnnotated">Value stored in the <c>isAnnotated</c> field.</param>
        /// <param name="wwt">Trainer instance stored in the <c>wwt</c> field.</param>
        public Identifier(Boolean isAnnotated, WhatWhyTrainer wwt)
        {
            this.wwt = wwt;
            this.isAnnotated = isAnnotated;

            // Candidate-based lists (who / when / where and the secondary what / why).
            listWhoCandidates = new List<Candidate>();
            listWhenCandidates = new List<Candidate>();
            listWhereCandidates = new List<Candidate>();
            listSecondaryWhatCandidates = new List<Candidate>();
            listSecondaryWhyCandidates = new List<Candidate>();

            // Token-sequence based lists (primary what / why).
            listWhatCandidates = new List<List<Token>>();
            listWhyCandidates = new List<List<Token>>();

            // One vector element per known part-of-speech tag, in declaration order.
            var posTags = Token.PartOfSpeechTags;
            fvPOS = new FastVector(posTags.Length);
            foreach (String tag in posTags)
            {
                fvPOS.addElement(tag);
            }

            // Model locations are relative to the working directory.
            String whoModelPath = @"..\..\IdentifierModels\who.model";
            String whenModelPath = @"..\..\IdentifierModels\when.model";
            String whereModelPath = @"..\..\IdentifierModels\where.model";

            whoClassifier = (Classifier)SerializationHelper.read(whoModelPath);
            whenClassifier = (Classifier)SerializationHelper.read(whenModelPath);
            whereClassifier = (Classifier)SerializationHelper.read(whereModelPath);
            // The what/why classifier models are intentionally not loaded here (disabled):
            //whatClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\what.model");
            //whyClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\why.model");

            initializeAnnotations();
        }
Beispiel #2
0
        /// <summary>
        /// Entry point. Parses the training news file, preprocesses every article into tokens and
        /// who/when/where/what/why candidates, then runs an <c>Identifier</c> over each
        /// preprocessed article between <c>WhatWhyTrainer.startTrain()</c> and
        /// <c>WhatWhyTrainer.endTrain()</c>.
        /// </summary>
        /// <remarks>
        /// Preprocessing is skipped entirely when the parsed article list is null or empty, or
        /// (in annotated mode) when the annotations are missing or do not match the articles
        /// one-to-one. In that case a message is written to standard error and the labeling loop
        /// runs zero times instead of failing on a null or empty list.
        /// </remarks>
        public static void Main()
        {
            /*#if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else*/
            Boolean isAnnotated = true;
            FileParser fileparserFP = new FileParser();
            String sourcePath = @"..\..\training_news.xml";
            // The three destination paths are only consumed by the disabled ResultWriter code
            // at the end of this method.
            String destinationPath = @"..\..\result.xml";
            String invertedDestinationPath = @"..\..\result_inverted_index.xml";
            String formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Per-article results of preprocessing; index i belongs to article i.
            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

            // Per-article results of labeling; only read by the disabled ResultWriter code below.
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Articles must exist; in annotated mode there must also be exactly one annotation
            // per article, because both lists are indexed in lockstep below.
            if (listCurrentArticles != null && listCurrentArticles.Count > 0 &&
                (!isAnnotated || (listCurrentTrainingAnnotations != null && listCurrentTrainingAnnotations.Count > 0 &&
                listCurrentArticles.Count == listCurrentTrainingAnnotations.Count)))
            {
                // Every article is preprocessed (no cap on the article count is applied here).
                for (int nI = 0; nI < listCurrentArticles.Count; nI++)
                {
                    preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                    preprocessor.preprocess();

                    if (isAnnotated)
                    {
                        preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                        preprocessor.performAnnotationAssignment();
                    }

                    listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                    listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                    listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                    listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                    listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                    listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
                }

                if (isAnnotated)
                {
                    /*Trainer trainer = new Trainer();
                    trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
                    trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
                    trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/
                }
            }
            else
            {
                // Make the skipped case visible instead of silently producing no results.
                System.Console.Error.WriteLine("No articles were preprocessed: the source file yielded no articles, or the annotations do not match the articles one-to-one.");
            }

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/
            /*try
            {
                var whoCandidatesPath = @"..\..\candidates_who.txt";
                var whenCandidatesPath = @"..\..\candidates_when.txt";
                var whereCandidatesPath = @"..\..\candidates_where.txt";

                if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
                if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
                if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);

                using (StreamWriter sw = File.CreateText(whoCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhoCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whenCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhenCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whereCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhereCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
            }
            catch (Exception e)
            {
                System.Console.WriteLine("Error with writing initial line of training dataset.");
            }*/
            #endregion

            WhatWhyTrainer wwt = new WhatWhyTrainer();
            wwt.startTrain();
            Identifier annotationIdentifier = new Identifier(isAnnotated, wwt);

            // Iterate over the articles that were actually preprocessed. The per-article lists
            // are only filled when the validation above passed, so bounding the loop by
            // listCurrentArticles.Count would dereference a null list or index empty lists
            // whenever preprocessing was skipped.
            int processedArticleCount = listTokenizedArticles.Count;
            for (int nI = 0; nI < processedArticleCount; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }
            wwt.endTrain();

            /*ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();*/
            //#endif
        }