예제 #1
0
        /// <summary>
        /// Entry point for the console pipeline: parses the training news file, preprocesses
        /// every article into tokens and who/when/where/what/why candidates, then lets the
        /// <c>Identifier</c> label each article while the what/why trainer session is open.
        /// </summary>
        /// <remarks>
        /// The Windows Forms start-up, the who/when/where trainer, the candidate printer and the
        /// result writer are all present only as commented-out code and do not run.
        /// </remarks>
        public static void Main()
        {
            /*#if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else*/
            Boolean isAnnotated = true;
            FileParser fileparserFP = new FileParser();
            String sourcePath = @"..\..\training_news.xml";
            // The destination paths are only consumed by the disabled ResultWriter step at the end.
            String destinationPath = @"..\..\result.xml";
            String invertedDestinationPath = @"..\..\result_inverted_index.xml";
            String formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Treat "no result" the same as "no articles" so the loops below never dereference null.
            if (listCurrentArticles == null)
            {
                listCurrentArticles = new List<Article>();
            }

            // Articles and annotations are consumed in parallel by index, so in annotated mode
            // there must be exactly one annotation per article.
            Boolean annotationsUsable = !isAnnotated ||
                (listCurrentTrainingAnnotations != null &&
                 listCurrentTrainingAnnotations.Count == listCurrentArticles.Count);

            if (listCurrentArticles.Count > 0 && !annotationsUsable)
            {
                // Previously preprocessing was skipped silently and the identification loop
                // then failed with an out-of-range index on the empty per-article lists.
                System.Console.Error.WriteLine("Training annotations are missing or do not match the number of articles; aborting.");
                return;
            }

            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Pass 1: preprocess every article and collect its tokens and candidates.
            // Each list gets exactly one entry per article, in article order.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                if (isAnnotated)
                {
                    preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                    preprocessor.performAnnotationAssignment();
                }

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Disabled who/when/where training step (was run only in annotated mode):
            /*Trainer trainer = new Trainer();
            trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
            trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
            trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/
            /*try
            {
                var whoCandidatesPath = @"..\..\candidates_who.txt";
                var whenCandidatesPath = @"..\..\candidates_when.txt";
                var whereCandidatesPath = @"..\..\candidates_where.txt";

                if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
                if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
                if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);

                using (StreamWriter sw = File.CreateText(whoCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhoCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whenCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhenCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
                using (StreamWriter sw = File.CreateText(whereCandidatesPath))
                {
                    for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
                    {
                        sw.WriteLine("#{0}:", nI);
                        foreach (var candidate in listAllWhereCandidates[nI])
                        {
                            sw.Write(candidate.Value + ", ");
                        }
                        sw.WriteLine("\n");
                    }
                }
            }
            catch (Exception e)
            {
                System.Console.WriteLine("Error with writing initial line of training dataset.");
            }*/
            #endregion

            // Pass 2: label every article. The identifier receives the trainer, and the
            // labelling happens between startTrain() and endTrain().
            WhatWhyTrainer wwt = new WhatWhyTrainer();
            wwt.startTrain();
            Identifier annotationIdentifier = new Identifier(isAnnotated, wwt);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }
            wwt.endTrain();

            /*ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();*/
            //#endif
        }
예제 #2
0
파일: Main.cs 프로젝트: andibandiii/IE
        /// <summary>
        /// Handles the Import button of the currently selected tab.
        /// </summary>
        /// <remarks>
        /// The path typed into the tab's text box must point to an existing .xml file; otherwise
        /// nothing happens. For tab 0 only the source path is remembered. For tab 1 the parsed
        /// articles and annotations are handed to the viewer and <c>loadArticles()</c> is called.
        /// For tab 2 they are handed to the navigator and the five reverse indexes are rebuilt
        /// from the sibling "&lt;name&gt;_inverted_index.xml" file. On success the tab's second
        /// box is enabled.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnImport_Click(object sender, EventArgs e)
        {
            int tabIndex = tabControl1.SelectedIndex;
            String enteredPath = textBoxes[tabIndex].Text;

            // Blank or whitespace-only input: nothing to import (whitespace would make FileInfo throw).
            if (enteredPath.Trim().Length == 0)
            {
                return;
            }

            FileInfo fi = new FileInfo(enteredPath);

            // Accept ".xml" in any casing; anything else is ignored without a message.
            if (!File.Exists(fi.FullName) || !fi.Extension.Equals(".xml", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            sourcePaths[tabIndex] = fi.FullName;

            if (tabIndex > 0)
            {
                List<Article> listArticles = fileparserFP.parseFile(sourcePaths[tabIndex]);
                List<Annotation> listAnnotations = fileparserFP.parseAnnotations(sourcePaths[tabIndex]);

                if (listArticles == null || listArticles.Count <= 0)
                {
                    MessageBox.Show("No articles found!");
                    return;
                }

                // A missing annotation list is treated as "no annotations" rather than a crash.
                if (listAnnotations == null)
                {
                    listAnnotations = new List<Annotation>();
                }

                for (int i = 0; i < listAnnotations.Count; i++)
                {
                    listAnnotations[i].Index = i;

                    // There may be more annotations than articles; only log titles that exist.
                    if (i < listArticles.Count)
                    {
                        Console.WriteLine(listArticles[i].Title + " " + i);
                    }
                }

                if (tabIndex == 1)
                {
                    listViewerArticles = listArticles;
                    listViewerAnnotations = listAnnotations;

                    loadArticles();
                }
                else if (tabIndex == 2)
                {
                    // "<name>.xml" -> "<name>_inverted_index.xml" (the extension is always 4 characters here).
                    String invertedIndexPath = fi.FullName.Insert(fi.FullName.Length - 4, "_inverted_index");

                    if (!File.Exists(invertedIndexPath))
                    {
                        MessageBox.Show("Inverted index file not found!");
                        return;
                    }

                    // Load the index document first so a malformed file does not replace the
                    // navigator's current articles.
                    XmlDocument doc = new XmlDocument();
                    doc.Load(invertedIndexPath);

                    listNavigatorArticles = listArticles;
                    listNavigatorAnnotations = listAnnotations;

                    // The reverse indexes are fields that outlive this click: drop the entries of a
                    // previous import, otherwise re-adding the same keys fails on the second import.
                    whoReverseIndex.Clear();
                    whenReverseIndex.Clear();
                    whereReverseIndex.Clear();
                    whatReverseIndex.Clear();
                    whyReverseIndex.Clear();

                    loadReverseIndexEntries(doc.DocumentElement.SelectNodes("/data/who/entry"), whoReverseIndex.Add);
                    loadReverseIndexEntries(doc.DocumentElement.SelectNodes("/data/when/entry"), whenReverseIndex.Add);
                    loadReverseIndexEntries(doc.DocumentElement.SelectNodes("/data/where/entry"), whereReverseIndex.Add);
                    loadReverseIndexEntries(doc.DocumentElement.SelectNodes("/data/what/entry"), whatReverseIndex.Add);
                    loadReverseIndexEntries(doc.DocumentElement.SelectNodes("/data/why/entry"), whyReverseIndex.Add);
                }
            }

            //firstBoxes[tabIndex].Enabled = false;
            secondBoxes[tabIndex].Enabled = true;
        }

        /// <summary>
        /// Reads each inverted-index entry (a "text" child plus any number of "articleIndex"
        /// children) and passes its text and parsed article indices to <paramref name="addEntry"/>.
        /// </summary>
        /// <param name="entries">The "entry" nodes of one section of the inverted-index file.</param>
        /// <param name="addEntry">Receives (entry text, article indices) for every entry, in document order.</param>
        private static void loadReverseIndexEntries(XmlNodeList entries, Action<String, List<int>> addEntry)
        {
            foreach (XmlNode entry in entries)
            {
                List<int> indices = new List<int>();
                foreach (XmlNode index in entry.SelectNodes("articleIndex"))
                {
                    indices.Add(Convert.ToInt32(index.InnerText));
                }
                addEntry(entry.SelectSingleNode("text").InnerText, indices);
            }
        }
예제 #3
0
파일: Program.cs 프로젝트: andibandiii/IE
        /// <summary>
        /// Program entry point. DEBUG builds start the Windows Forms UI; other builds run the
        /// batch pipeline: parse the training news file, preprocess every article into tokens
        /// and who/when/where/what/why candidates, label each article with the
        /// <c>Identifier</c>, and write the three result files through <c>ResultWriter</c>.
        /// </summary>
        public static void Main()
        {
            #if DEBUG
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new Main());
            #else
            Boolean isAnnotated = true;
            FileParser fileparserFP = new FileParser();
            String sourcePath = @"..\..\training_news.xml";
            String destinationPath = @"..\..\result.xml";
            String invertedDestinationPath = @"..\..\result_inverted_index.xml";
            String formatDateDestinationPath = @"..\..\result_format_date.xml";

            List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
            List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
            if (isAnnotated)
            {
                listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
            }

            // Treat "no result" the same as "no articles" so nothing below dereferences null.
            if (listCurrentArticles == null)
            {
                listCurrentArticles = new List<Article>();
            }

            // Articles and annotations are consumed in parallel by index, so in annotated mode
            // there must be exactly one annotation per article.
            Boolean annotationsUsable = !isAnnotated ||
                (listCurrentTrainingAnnotations != null &&
                 listCurrentTrainingAnnotations.Count == listCurrentArticles.Count);

            if (listCurrentArticles.Count > 0 && !annotationsUsable)
            {
                // Previously preprocessing was skipped silently and the identification loop
                // then failed with an out-of-range index on the empty per-article lists.
                System.Console.Error.WriteLine("Training annotations are missing or do not match the number of articles; aborting.");
                return;
            }

            List<List<Token>> listTokenizedArticles = new List<List<Token>>();
            List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
            List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
            List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
            List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();
            List<List<String>> listAllWhoAnnotations = new List<List<String>>();
            List<List<String>> listAllWhenAnnotations = new List<List<String>>();
            List<List<String>> listAllWhereAnnotations = new List<List<String>>();
            List<String> listAllWhatAnnotations = new List<String>();
            List<String> listAllWhyAnnotations = new List<String>();

            Preprocessor preprocessor = new Preprocessor();

            // Pass 1: preprocess every article and collect its tokens and candidates.
            // Each list gets exactly one entry per article, in article order.
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                preprocessor.setCurrentArticle(listCurrentArticles[nI]);
                preprocessor.preprocess();

                if (isAnnotated)
                {
                    preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                    preprocessor.performAnnotationAssignment();
                }

                listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
                listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
                listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
                listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
                listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
                listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
            }

            // Disabled who/when/where training step (was run only in annotated mode):
            /*Trainer trainer = new Trainer();
             * trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
             * trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
             * trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/

            #region Candidate Selection Printer
            /*Candidate Selection Printer*/

            /*try
             * {
             *  var whoCandidatesPath = @"..\..\candidates_who.txt";
             *  var whenCandidatesPath = @"..\..\candidates_when.txt";
             *  var whereCandidatesPath = @"..\..\candidates_where.txt";
             *
             *  if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
             *  if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
             *  if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);
             *
             *  using (StreamWriter sw = File.CreateText(whoCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhoCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             *  using (StreamWriter sw = File.CreateText(whenCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhenCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             *  using (StreamWriter sw = File.CreateText(whereCandidatesPath))
             *  {
             *      for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
             *      {
             *          sw.WriteLine("#{0}:", nI);
             *          foreach (var candidate in listAllWhereCandidates[nI])
             *          {
             *              sw.Write(candidate.Value + ", ");
             *          }
             *          sw.WriteLine("\n");
             *      }
             *  }
             * }
             * catch (Exception e)
             * {
             *  System.Console.WriteLine("Error with writing initial line of training dataset.");
             * }*/
            #endregion

            // Pass 2: label every article. In annotated mode the labelling happens between the
            // trainer's startTrain() and endTrain() calls.
            WhyTrainer wt = new WhyTrainer();
            if (isAnnotated)
            {
                wt.startTrain();
            }
            Identifier annotationIdentifier = new Identifier(isAnnotated, wt);
            for (int nI = 0; nI < listCurrentArticles.Count; nI++)
            {
                annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
                annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
                annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
                annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
                annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
                annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
                annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
                if (isAnnotated)
                {
                    annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
                }
                annotationIdentifier.labelAnnotations();
                listAllWhoAnnotations.Add(annotationIdentifier.getWho());
                listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
                listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
                listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
                listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
            }
            if (isAnnotated)
            {
                wt.endTrain();
            }

            // Write the plain, date-formatted and inverted-index result files.
            ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath, listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations, listAllWhatAnnotations, listAllWhyAnnotations);
            rw.generateOutput();
            rw.generateOutputFormatDate();
            rw.generateInvertedIndexOutput();
            #endif
        }