Пример #1
0
        /// <summary>
        /// Builds an identifier: stores the supplied flag and trainer, allocates empty
        /// candidate lists, fills the part-of-speech vector from
        /// <c>Token.PartOfSpeechTags</c>, reads the four serialized classifier models
        /// and finally calls <c>initializeAnnotations()</c>.
        /// </summary>
        /// <param name="isAnnotated">Value stored in the <c>isAnnotated</c> field.</param>
        /// <param name="wt">Trainer instance stored in the <c>wt</c> field.</param>
        public Identifier(Boolean isAnnotated, WhyTrainer wt)
        {
            this.isAnnotated = isAnnotated;
            this.wt = wt;

            // Every candidate collection starts out empty.
            listWhoCandidates = new List<Candidate>();
            listWhenCandidates = new List<Candidate>();
            listWhereCandidates = new List<Candidate>();
            listWhatCandidates = new List<List<Token>>();
            listWhyCandidates = new List<List<Token>>();
            listSecondaryWhyCandidates = new List<Candidate>();

            // One vector element per part-of-speech tag, in declaration order.
            var posTags = Token.PartOfSpeechTags;
            fvPOS = new FastVector(posTags.Length);
            foreach (String tag in posTags)
            {
                fvPOS.addElement(tag);
            }

            // Classifiers are deserialized from model files addressed by relative paths.
            whoClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\who.model");
            whenClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\when.model");
            whereClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\where.model");
            whyClassifier = (Classifier)SerializationHelper.read(@"..\..\IdentifierModels\why.model");

            initializeAnnotations();
        }
Пример #2
0
        /// <summary>
        /// Program entry point.
        /// In DEBUG builds it starts the Windows Forms UI. In other builds it runs the
        /// batch pipeline: parse the articles (and, when <c>isAnnotated</c> is set, their
        /// annotations) from the source XML, preprocess every article to collect the
        /// who/when/where/what/why candidates, label each article with an
        /// <c>Identifier</c>, and write the three result files through <c>ResultWriter</c>.
        /// </summary>
        /// <remarks>
        /// If the article list cannot be parsed, or annotations are required but do not
        /// line up one-to-one with a non-empty article list, an error is written to
        /// standard error and the method returns without producing output. An empty
        /// (non-null) article list still produces empty result files.
        /// </remarks>
        public static void Main()
        {
            #if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else
            Boolean    isAnnotated               = true;
            FileParser fileparserFP              = new FileParser();
            String     sourcePath                = @"..\..\training_news.xml";
            String     destinationPath           = @"..\..\result.xml";
            String     invertedDestinationPath   = @"..\..\result_inverted_index.xml";
            String     formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article>    listCurrentArticles            = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Validate the inputs up front. The labelling loop below indexes the
            // per-article lists by article position, so it must never run unless the
            // preprocessing loop has filled them for every article.
            if (listCurrentArticles == null)
            {
                System.Console.Error.WriteLine("No articles could be parsed from " + sourcePath + ".");
                return;
            }

            Boolean annotationsUsable = !isAnnotated ||
                                        (listCurrentTrainingAnnotations != null &&
                                         listCurrentTrainingAnnotations.Count == listCurrentArticles.Count);
            if (listCurrentArticles.Count > 0 && !annotationsUsable)
            {
                System.Console.Error.WriteLine("The annotations parsed from " + sourcePath + " do not match the articles one-to-one.");
                return;
            }

            List<List<Token>>       listTokenizedArticles   = new List<List<Token>>();
            List<List<Candidate>>   listAllWhoCandidates    = new List<List<Candidate>>();
            List<List<Candidate>>   listAllWhenCandidates   = new List<List<Candidate>>();
            List<List<Candidate>>   listAllWhereCandidates  = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates   = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates    = new List<List<List<Token>>>();
            List<List<String>>      listAllWhoAnnotations   = new List<List<String>>();
            List<List<String>>      listAllWhenAnnotations  = new List<List<String>>();
            List<List<String>>      listAllWhereAnnotations = new List<List<String>>();
            List<String>            listAllWhatAnnotations  = new List<String>();
            List<String>            listAllWhyAnnotations   = new List<String>();

            // Pass 1: preprocess every article and collect its tokens and candidates.
            Preprocessor preprocessor = new Preprocessor();
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                if (isAnnotated)
                {
                    preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                    preprocessor.performAnnotationAssignment();
                }

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Pass 2: label every article. The why-trainer is bracketed by
            // startTrain/endTrain only when annotated data is available.
            WhyTrainer wt = new WhyTrainer();
            if (isAnnotated)
            {
                wt.startTrain();
            }

            Identifier annotationIdentifier = new Identifier(isAnnotated, wt);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }

                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }

            if (isAnnotated)
            {
                wt.endTrain();
            }

            // Write the plain, date-formatted and inverted-index result files.
            ResultWriter rw = new ResultWriter(
                destinationPath,
                formatDateDestinationPath,
                invertedDestinationPath,
                listCurrentArticles,
                listAllWhoAnnotations,
                listAllWhenAnnotations,
                listAllWhereAnnotations,
                listAllWhatAnnotations,
                listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();
            #endif
        }