/// <summary>
/// Parses the source file of the currently selected tab, preprocesses every article,
/// labels the who/when/where/what/why annotations and writes the three result outputs.
/// </summary>
/// <param name="destinationPath">First path argument handed to <c>ResultWriter</c>.</param>
/// <param name="invertedDestinationPath">Second path argument handed to <c>ResultWriter</c>.</param>
/// <param name="formatDateDestinationPath">Third path argument handed to <c>ResultWriter</c>.</param>
/// <returns>
/// <c>false</c> (after showing the "Invalid XML File!" message box) when the parser returns
/// <c>null</c> or an empty list; <c>true</c> once all three outputs have been generated.
/// </returns>
private Boolean extract(String destinationPath, String invertedDestinationPath, String formatDateDestinationPath)
{
    List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePaths[tabControl1.SelectedIndex]);

    // Nothing to process: report and bail out before doing any work.
    if (listCurrentArticles == null || listCurrentArticles.Count == 0)
    {
        MessageBox.Show("Invalid XML File!");
        return false;
    }

    // Per-article intermediate results; index nI in every list belongs to listCurrentArticles[nI].
    List<List<Token>> listTokenizedArticles = new List<List<Token>>();
    List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
    List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
    List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

    // NOTE: the precision/recall/sentence-position bookkeeping that used to be declared here was only
    // referenced by commented-out evaluation code, so the unused accumulators were removed.
    Preprocessor preprocessor = new Preprocessor();
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        preprocessor.setCurrentArticle(listCurrentArticles[nI]);
        preprocessor.preprocess();

        listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
        listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
        listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
        listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
        listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
        listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
    }

    List<List<String>> listAllWhoAnnotations = new List<List<String>>();
    List<List<String>> listAllWhenAnnotations = new List<List<String>>();
    List<List<String>> listAllWhereAnnotations = new List<List<String>>();
    List<String> listAllWhatAnnotations = new List<String>();
    List<String> listAllWhyAnnotations = new List<String>();

    // NOTE(review): (false, null) presumably means "not annotated, no trainer" - confirm against Identifier.
    Identifier annotationIdentifier = new Identifier(false, null);
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
        annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
        annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
        annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
        annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
        annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
        annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
        annotationIdentifier.labelAnnotations();

        listAllWhoAnnotations.Add(annotationIdentifier.getWho());
        listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
        listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
        listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
        listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
    }

    ResultWriter rw = new ResultWriter(
        destinationPath,
        invertedDestinationPath,
        formatDateDestinationPath,
        listCurrentArticles,
        listAllWhoAnnotations,
        listAllWhenAnnotations,
        listAllWhereAnnotations,
        listAllWhatAnnotations,
        listAllWhyAnnotations);
    rw.generateOutput();
    rw.generateOutputFormatDate();
    rw.generateInvertedIndexOutput();

    return true;
}
/// <summary>
/// Runs the extraction pipeline for the file behind the currently selected tab:
/// parse, preprocess each article, label its annotations, then write the results.
/// </summary>
/// <param name="destinationPath">First path argument forwarded to <c>ResultWriter</c>.</param>
/// <param name="invertedDestinationPath">Second path argument forwarded to <c>ResultWriter</c>.</param>
/// <param name="formatDateDestinationPath">Third path argument forwarded to <c>ResultWriter</c>.</param>
/// <returns>
/// <c>true</c> after the three outputs were generated; <c>false</c> when parsing produced
/// <c>null</c> or no articles (the "Invalid XML File!" message box is shown in that case).
/// </returns>
private Boolean extract(String destinationPath, String invertedDestinationPath, String formatDateDestinationPath)
{
    List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePaths[tabControl1.SelectedIndex]);

    // Guard clause: an unparsable or empty source ends the run immediately.
    if (listCurrentArticles == null || listCurrentArticles.Count == 0)
    {
        MessageBox.Show("Invalid XML File!");
        return false;
    }

    // Parallel lists: entry nI of each list corresponds to listCurrentArticles[nI].
    List<List<Token>> listTokenizedArticles = new List<List<Token>>();
    List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
    List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
    List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

    // NOTE: the float accumulators and the `statistics` array previously declared here were only used
    // by commented-out evaluation/printing code and have been dropped together with that dead code.
    Preprocessor preprocessor = new Preprocessor();
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        preprocessor.setCurrentArticle(listCurrentArticles[nI]);
        preprocessor.preprocess();

        listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
        listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
        listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
        listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
        listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
        listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
    }

    List<List<String>> listAllWhoAnnotations = new List<List<String>>();
    List<List<String>> listAllWhenAnnotations = new List<List<String>>();
    List<List<String>> listAllWhereAnnotations = new List<List<String>>();
    List<String> listAllWhatAnnotations = new List<String>();
    List<String> listAllWhyAnnotations = new List<String>();

    // NOTE(review): the (false, null) arguments look like "no annotations, no trainer" - verify in Identifier.
    Identifier annotationIdentifier = new Identifier(false, null);
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
        annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
        annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
        annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
        annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
        annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
        annotationIdentifier.setTitle(listCurrentArticles[nI].Title);
        annotationIdentifier.labelAnnotations();

        listAllWhoAnnotations.Add(annotationIdentifier.getWho());
        listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
        listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
        listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
        listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
    }

    ResultWriter rw = new ResultWriter(
        destinationPath,
        invertedDestinationPath,
        formatDateDestinationPath,
        listCurrentArticles,
        listAllWhoAnnotations,
        listAllWhenAnnotations,
        listAllWhereAnnotations,
        listAllWhatAnnotations,
        listAllWhyAnnotations);
    rw.generateOutput();
    rw.generateOutputFormatDate();
    rw.generateInvertedIndexOutput();

    return true;
}
/// <summary>
/// Batch entry point: parses the training file and its annotations, preprocesses every article,
/// then runs the identifier over each article between <c>wwt.startTrain()</c> and <c>wwt.endTrain()</c>.
/// </summary>
/// <remarks>
/// The WinForms start-up path (<c>Application.Run(new Main())</c> under <c>#if DEBUG</c>) was commented
/// out in the original, so this method always takes the batch path.
/// </remarks>
public static void Main()
{
    Boolean isAnnotated = true;
    FileParser fileparserFP = new FileParser();
    String sourcePath = @"..\..\training_news.xml";

    List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
    List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
    if (isAnnotated)
    {
        listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
    }

    bool hasArticles = listCurrentArticles != null && listCurrentArticles.Count > 0;
    bool annotationsUsable = !isAnnotated
        || (listCurrentTrainingAnnotations != null
            && listCurrentTrainingAnnotations.Count > 0
            && listCurrentArticles != null
            && listCurrentArticles.Count == listCurrentTrainingAnnotations.Count);

    // Previously a failed validation only skipped preprocessing; the labelling loop below then
    // dereferenced a null article list or indexed into the empty per-article lists and crashed.
    if (!hasArticles || !annotationsUsable)
    {
        System.Console.Error.WriteLine(
            "Could not load articles (and matching annotations) from " + sourcePath + "; nothing to train on.");
        return;
    }

    // Per-article intermediate results; index nI in every list belongs to listCurrentArticles[nI].
    List<List<Token>> listTokenizedArticles = new List<List<Token>>();
    List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
    List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
    List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

    Preprocessor preprocessor = new Preprocessor();
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        preprocessor.setCurrentArticle(listCurrentArticles[nI]);
        preprocessor.preprocess();

        if (isAnnotated)
        {
            preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
            preprocessor.performAnnotationAssignment();
        }

        listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
        listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
        listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
        listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
        listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
        listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
    }

    // NOTE: the disabled who/when/where Trainer.trainMany(...) calls and the disabled
    // "Candidate Selection Printer" debug dump that used to follow were dead (commented-out) code
    // and have been removed.

    List<List<String>> listAllWhoAnnotations = new List<List<String>>();
    List<List<String>> listAllWhenAnnotations = new List<List<String>>();
    List<List<String>> listAllWhereAnnotations = new List<List<String>>();
    List<String> listAllWhatAnnotations = new List<String>();
    List<String> listAllWhyAnnotations = new List<String>();

    WhatWhyTrainer wwt = new WhatWhyTrainer();
    wwt.startTrain();

    Identifier annotationIdentifier = new Identifier(isAnnotated, wwt);
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
        annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
        annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
        annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
        annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
        annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
        annotationIdentifier.setTitle(listCurrentArticles[nI].Title);

        if (isAnnotated)
        {
            annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
        }

        annotationIdentifier.labelAnnotations();

        listAllWhoAnnotations.Add(annotationIdentifier.getWho());
        listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
        listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
        listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
        listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
    }

    wwt.endTrain();

    // Result writing is disabled in this entry point. The original (commented-out) call was:
    //   String destinationPath = @"..\..\result.xml";
    //   String invertedDestinationPath = @"..\..\result_inverted_index.xml";
    //   String formatDateDestinationPath = @"..\..\result_format_date.xml";
    //   ResultWriter rw = new ResultWriter(destinationPath, formatDateDestinationPath, invertedDestinationPath,
    //       listCurrentArticles, listAllWhoAnnotations, listAllWhenAnnotations, listAllWhereAnnotations,
    //       listAllWhatAnnotations, listAllWhyAnnotations);
    //   rw.generateOutput();
    //   rw.generateOutputFormatDate();
    //   rw.generateInvertedIndexOutput();
    // NOTE(review): that call passes formatDate before inverted - confirm the parameter order of
    // ResultWriter's constructor before re-enabling it.
}
/// <summary>
/// Turns every entry of <c>listWhyCandidates</c> into a scored <c>Candidate</c> and, when
/// <c>isAnnotated</c> is set, hands the collected candidates to <c>wwt.train("why", ...)</c>.
/// </summary>
/// <remarks>
/// Scoring per candidate: +0.5 when the text contains <c>strWhat</c>, +0.5 when it contains a causal
/// marker, plus a 0.5 carry-over when the previous candidate was exactly <c>strWhat</c>; the score is
/// reset to 0 when the trimmed text is itself contained in <c>strWhat</c>.
/// The method does not assign <c>strWhy</c>; the classifier code that did so is disabled.
/// Fixes relative to the previous version:
/// the position offset is derived from the marker index instead of re-searching the trimmed text
/// (which could match an earlier occurrence); the marker is located with an ordinal search, matching
/// the ordinal <c>Contains</c> that selected it; the annotation fallback no longer skips the last two
/// article tokens for short annotations and no longer throws on an empty tokenization.
/// </remarks>
private void labelWhy()
{
    const double WEIGHT_PER_MARKER = 0.5;
    const double WEIGHT_PER_WHAT = 0.5;
    double carryOver = 0;

    // { marker text, side of the marker to keep }: "START" keeps what follows the marker,
    // anything else keeps what precedes it. The first entry contained in the text wins.
    // NOTE(review): " dahilan sa" has no trailing space unlike its neighbours - confirm this is intended.
    String[][] markers = new String[][]
    {
        new String[] { " sanhi sa ", "START" },
        new String[] { " sanhi ng ", "START" },
        new String[] { " sapagkat ", "START" },
        new String[] { " palibhasa ay ", "START" },
        new String[] { " palibhasa ", "START" },
        new String[] { " kasi ", "START" },
        new String[] { " mangyari'y ", "START" },
        new String[] { " mangyari ay ", "START" },
        new String[] { " dahil sa ", "START" },
        new String[] { " dahil na rin sa ", "START" },
        new String[] { " dahil ", "START" },
        new String[] { " dahilan sa", "START" },
        new String[] { " dahilan ", "START" },
        new String[] { " para ", "START" },
        new String[] { " upang ", "START" },
        new String[] { " makaraang ", "START" },
        new String[] { " naglalayong ", "START" },
        new String[] { " kaya ", "END" }
    };

    if (listWhyCandidates.Count > 0)
    {
        bool foundMatching = false;

        // Built once instead of once per candidate.
        Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
        String annotationKey = isAnnotated ? nonAlphanumeric.Replace(annotationCurrent.Why, "") : null;

        foreach (List<Token> candidate in listWhyCandidates)
        {
            // Re-join the tokens and undo the tokenizer's bracket/punctuation spacing.
            String fullText = String.Join(" ", candidate.Select(token => token.Value).ToArray());
            fullText = fullText.Replace("-LRB- ", "(");
            fullText = fullText.Replace(" -RRB-", ")");
            fullText = fullText.Replace(" . ", ".");
            fullText = fullText.Replace(" .", ".");
            fullText = fullText.Replace(" ,", ",");
            fullText = fullText.Replace(" !", "!");

            double tempWeight = 0;
            if (fullText.Contains(strWhat))
            {
                tempWeight += WEIGHT_PER_WHAT;
            }

            String tempWhy = fullText;
            int startIndex = 0; // character offset of tempWhy inside fullText
            String[] match = markers.FirstOrDefault(s => fullText.Contains(s[0]));
            if (match != null)
            {
                // Ordinal search: consistent with the ordinal Contains above, so the index is never -1.
                int markerIndex = fullText.IndexOf(match[0], StringComparison.Ordinal);
                if (match[1].Equals("START"))
                {
                    startIndex = markerIndex + match[0].Length;
                    tempWhy = fullText.Substring(startIndex);
                }
                else
                {
                    tempWhy = fullText.Substring(0, markerIndex);
                }
                tempWeight += WEIGHT_PER_MARKER;
            }

            tempWeight += carryOver;
            carryOver = 0;

            if (strWhat.Contains(tempWhy))
            {
                tempWeight = 0;
            }
            if (strWhat.Equals(tempWhy))
            {
                // The sentence that is exactly the "what" boosts the candidate that follows it.
                carryOver = 0.5;
            }

            // Number of space-separated words skipped in front of the kept text.
            int position = candidate[0].Position + fullText.Substring(0, startIndex).Split(' ').Length - 1;
            int length = tempWhy.Split(' ').Length;

            Candidate newCandidate = new Candidate(tempWhy, position, length);
            newCandidate.Sentence = candidate[0].Sentence;
            newCandidate.Score = tempWeight;
            newCandidate.NumWho = listWho.Where(tempWhy.Contains).Count();
            newCandidate.NumWhen = listWhen.Where(tempWhy.Contains).Count();
            newCandidate.NumWhere = listWhere.Where(tempWhy.Contains).Count();

            if (isAnnotated)
            {
                // Compare ignoring everything except ASCII letters and digits.
                var candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                if (candidateValue == annotationKey)
                {
                    newCandidate.IsWhy = true;
                    foundMatching = true;
                }
            }

            listSecondaryWhyCandidates.Add(newCandidate);
        }

        // No candidate equals the annotated "why": try to add the annotation itself as a candidate.
        if (isAnnotated && !foundMatching && annotationCurrent.Why.Length > 0)
        {
            Preprocessor p = new Preprocessor();
            List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.Why);
            Candidate newCandidate = new Candidate(
                annotationCurrent.Why,
                0,
                annotationCurrent.Why.Split(' ').Length);

            int sentenceNumber = -1;
            int position = -1;

            // Locate the annotation in the article by its first (up to) three tokens.
            int tokensToMatch = tokenizedAnnotation == null ? 0 : Math.Min(tokenizedAnnotation.Count, 3);
            if (tokensToMatch > 0)
            {
                for (int i = 0; i + tokensToMatch <= articleCurrent.Count; i++)
                {
                    bool matches = true;
                    for (int k = 0; k < tokensToMatch; k++)
                    {
                        if (tokenizedAnnotation[k].Value != articleCurrent[i + k].Value)
                        {
                            matches = false;
                            break;
                        }
                    }

                    if (matches)
                    {
                        sentenceNumber = articleCurrent[i].Sentence;
                        position = articleCurrent[i].Position;
                        break;
                    }
                }
            }

            if (sentenceNumber != -1 && position != -1)
            {
                double tempWeight = 0;
                if (annotationCurrent.Why.Contains(annotationCurrent.What))
                {
                    tempWeight += WEIGHT_PER_WHAT;
                }

                String[] match = markers.FirstOrDefault(s => annotationCurrent.Why.Contains(s[0]));
                if (match != null)
                {
                    tempWeight += WEIGHT_PER_MARKER;
                }

                tempWeight += carryOver;
                carryOver = 0;

                if (annotationCurrent.What.Contains(annotationCurrent.Why))
                {
                    tempWeight = 0;
                }

                // NOTE(review): IsWhy is not set on this fallback candidate - confirm whether it should be.
                newCandidate.Position = position;
                newCandidate.Sentence = sentenceNumber;
                newCandidate.Score = tempWeight;
                newCandidate.NumWho = listWho.Where(annotationCurrent.Why.Contains).Count();
                newCandidate.NumWhen = listWhen.Where(annotationCurrent.Why.Contains).Count();
                newCandidate.NumWhere = listWhere.Where(annotationCurrent.Why.Contains).Count();
                listSecondaryWhyCandidates.Add(newCandidate);
            }
        }

        if (isAnnotated)
        {
            wwt.train("why", articleCurrent, listSecondaryWhyCandidates);
        }

        // Start the next article with a fresh candidate list.
        listSecondaryWhyCandidates = new List<Candidate>();
    }

    /*Instances whyInstances = createWhyInstances();
    foreach (Instance instance in whyInstances)
    {
        double[] classProbability = whyClassifier.distributionForInstance(instance);
        if (classProbability[0] >= classProbability[1])
        {
            strWhy = instance.stringValue(0);
            break;
        }
    }*/
}
/// <summary>
/// Turns every entry of <c>listWhatCandidates</c> into a scored <c>Candidate</c> and, when
/// <c>isAnnotated</c> is set, hands the collected candidates to <c>wwt.train("what", ...)</c>.
/// </summary>
/// <remarks>
/// Score per candidate: 0.3 per who, 0.2 per when and 0.2 per where value found in the text,
/// plus <c>1 - 0.2 * sentence</c>, plus 0.1 per who/when/where value found in <c>titleCurrent</c>.
/// The score is computed on the full text; the stored value is the text trimmed at the first marker.
/// The method does not assign <c>strWhat</c>; the classifier code that did so is disabled.
/// Fixes relative to the previous version:
/// a "START" marker at the very end of the text no longer makes <c>Substring</c> throw
/// <c>ArgumentOutOfRangeException</c>; the position offset comes from the marker index instead of
/// re-searching the trimmed text; the marker is located with an ordinal search; the annotation
/// fallback no longer skips the last two article tokens for short annotations and no longer throws
/// on an empty tokenization.
/// </remarks>
private void labelWhat()
{
    const double WEIGHT_PER_WHO = 0.3;
    const double WEIGHT_PER_WHEN = 0.2;
    const double WEIGHT_PER_WHERE = 0.2;
    const double WEIGHT_PER_SENTENCE = 0.2;
    const double WEIGHT_PER_W_IN_TITLE = 0.1;

    // { marker text, side of the marker to keep }: "START" keeps what follows the marker,
    // anything else keeps what precedes it. The first entry contained in the text wins.
    // NOTE(review): these markers are matched as plain substrings, so they also match inside
    // longer words - confirm whether whole-word matching was intended.
    String[][] markers = new String[][]
    {
        new String[] { "kaya", "START" },
        new String[] { "para", "END" },
        new String[] { "dahil", "END" },
        new String[] { "upang", "END" },
        new String[] { "makaraang", "END" },
    };

    if (listWhatCandidates.Count > 0)
    {
        bool foundMatching = false;

        // Loop-invariant values, computed once instead of once per candidate.
        int whoInTitle = listWho.Where(titleCurrent.Contains).Count();
        int whenInTitle = listWhen.Where(titleCurrent.Contains).Count();
        int whereInTitle = listWhere.Where(titleCurrent.Contains).Count();
        Regex nonAlphanumeric = new Regex("[^a-zA-Z0-9]");
        String annotationKey = isAnnotated ? nonAlphanumeric.Replace(annotationCurrent.What, "") : null;

        foreach (List<Token> candidate in listWhatCandidates)
        {
            // Re-join the tokens and undo the tokenizer's bracket/punctuation spacing.
            String fullText = String.Join(" ", candidate.Select(token => token.Value).ToArray());
            fullText = fullText.Replace("-LRB- ", "(");
            fullText = fullText.Replace(" -RRB-", ")");
            fullText = fullText.Replace(" . ", ".");
            fullText = fullText.Replace(" .", ".");
            fullText = fullText.Replace(" ,", ",");
            fullText = fullText.Replace(" !", "!");

            // Same order of additions as before so the floating-point result is unchanged.
            double tempWeight = 0;
            tempWeight += listWho.Where(fullText.Contains).Count() * WEIGHT_PER_WHO;
            tempWeight += listWhen.Where(fullText.Contains).Count() * WEIGHT_PER_WHEN;
            tempWeight += listWhere.Where(fullText.Contains).Count() * WEIGHT_PER_WHERE;
            tempWeight += 1 - WEIGHT_PER_SENTENCE * candidate[0].Sentence;
            tempWeight += whoInTitle * WEIGHT_PER_W_IN_TITLE;
            tempWeight += whenInTitle * WEIGHT_PER_W_IN_TITLE;
            tempWeight += whereInTitle * WEIGHT_PER_W_IN_TITLE;

            String tempWhat = fullText;
            int startIndex = 0; // character offset of tempWhat inside fullText
            String[] match = markers.FirstOrDefault(s => fullText.Contains(s[0]));
            if (match != null)
            {
                // Ordinal search: consistent with the ordinal Contains above, so the index is never -1.
                int markerIndex = fullText.IndexOf(match[0], StringComparison.Ordinal);
                if (match[1].Equals("START"))
                {
                    // Skip the marker plus the separator after it; clamp so a marker that ends the
                    // text yields an empty remainder instead of an out-of-range Substring call.
                    startIndex = Math.Min(markerIndex + match[0].Length + 1, fullText.Length);
                    tempWhat = fullText.Substring(startIndex);
                }
                else
                {
                    tempWhat = fullText.Substring(0, markerIndex);
                }
            }

            // Number of space-separated words skipped in front of the kept text.
            int position = candidate[0].Position + fullText.Substring(0, startIndex).Split(' ').Length - 1;
            int length = tempWhat.Split(' ').Length;

            Candidate newCandidate = new Candidate(tempWhat, position, length);
            newCandidate.Sentence = candidate[0].Sentence;
            newCandidate.Score = tempWeight;
            newCandidate.NumWho = listWho.Where(tempWhat.Contains).Count();
            newCandidate.NumWhen = listWhen.Where(tempWhat.Contains).Count();
            newCandidate.NumWhere = listWhere.Where(tempWhat.Contains).Count();

            if (isAnnotated)
            {
                // Compare ignoring everything except ASCII letters and digits.
                var candidateValue = nonAlphanumeric.Replace(newCandidate.Value, "");
                if (candidateValue == annotationKey)
                {
                    newCandidate.IsWhat = true;
                    foundMatching = true;
                }
            }

            listSecondaryWhatCandidates.Add(newCandidate);
        }

        // No candidate equals the annotated "what": try to add the annotation itself as a candidate.
        if (isAnnotated && !foundMatching && annotationCurrent.What.Length > 0)
        {
            Preprocessor p = new Preprocessor();
            List<Token> tokenizedAnnotation = p.performTokenizationAndSS(annotationCurrent.What);
            Candidate newCandidate = new Candidate(
                annotationCurrent.What,
                0,
                annotationCurrent.What.Split(' ').Length);

            int sentenceNumber = -1;
            int position = -1;

            // Locate the annotation in the article by its first (up to) three tokens.
            int tokensToMatch = tokenizedAnnotation == null ? 0 : Math.Min(tokenizedAnnotation.Count, 3);
            if (tokensToMatch > 0)
            {
                for (int i = 0; i + tokensToMatch <= articleCurrent.Count; i++)
                {
                    bool matches = true;
                    for (int k = 0; k < tokensToMatch; k++)
                    {
                        if (tokenizedAnnotation[k].Value != articleCurrent[i + k].Value)
                        {
                            matches = false;
                            break;
                        }
                    }

                    if (matches)
                    {
                        sentenceNumber = articleCurrent[i].Sentence;
                        position = articleCurrent[i].Position;
                        break;
                    }
                }
            }

            if (sentenceNumber != -1 && position != -1)
            {
                double tempWeight = 0;
                tempWeight += listWho.Where(annotationCurrent.What.Contains).Count() * WEIGHT_PER_WHO;
                tempWeight += listWhen.Where(annotationCurrent.What.Contains).Count() * WEIGHT_PER_WHEN;
                tempWeight += listWhere.Where(annotationCurrent.What.Contains).Count() * WEIGHT_PER_WHERE;
                tempWeight += 1 - WEIGHT_PER_SENTENCE * sentenceNumber;
                tempWeight += whoInTitle * WEIGHT_PER_W_IN_TITLE;
                tempWeight += whenInTitle * WEIGHT_PER_W_IN_TITLE;
                tempWeight += whereInTitle * WEIGHT_PER_W_IN_TITLE;

                // NOTE(review): IsWhat is not set on this fallback candidate - confirm whether it should be.
                newCandidate.Position = position;
                newCandidate.Sentence = sentenceNumber;
                newCandidate.Score = tempWeight;
                newCandidate.NumWho = listWho.Where(annotationCurrent.What.Contains).Count();
                newCandidate.NumWhen = listWhen.Where(annotationCurrent.What.Contains).Count();
                newCandidate.NumWhere = listWhere.Where(annotationCurrent.What.Contains).Count();
                listSecondaryWhatCandidates.Add(newCandidate);
            }
        }

        if (isAnnotated)
        {
            wwt.train("what", articleCurrent, listSecondaryWhatCandidates);
        }

        // Start the next article with a fresh candidate list.
        listSecondaryWhatCandidates = new List<Candidate>();

        /*Instances whatInstances = createWhatInstances();
        foreach (Instance instance in whatInstances)
        {
            double[] classProbability = whatClassifier.distributionForInstance(instance);
            if (classProbability[0] >= classProbability[1])
            {
                strWhat = instance.stringValue(0);
                break;
            }
        }*/
    }
}
/// <summary>
/// Application entry point.
/// In DEBUG builds it starts the form via <c>Application.Run</c>. In all other
/// builds it runs the batch pipeline: parse the articles (and, when
/// <c>isAnnotated</c> is set, their annotations) from the training file,
/// preprocess every article to collect its candidates, label every article
/// through <c>Identifier</c>, and finally ask <c>ResultWriter</c> to generate
/// its three outputs.
/// </summary>
public static void Main()
{
#if DEBUG
    Application.EnableVisualStyles();
    Application.SetCompatibleTextRenderingDefault(false);
    Application.Run(new Main());
#else
    Boolean isAnnotated = true;
    FileParser fileparserFP = new FileParser();
    String sourcePath = @"..\..\training_news.xml";
    String destinationPath = @"..\..\result.xml";
    String invertedDestinationPath = @"..\..\result_inverted_index.xml";
    String formatDateDestinationPath = @"..\..\result_format_date.xml";

    List<Article> listCurrentArticles = fileparserFP.parseFile(sourcePath);
    List<Annotation> listCurrentTrainingAnnotations = new List<Annotation>();
    if (isAnnotated)
    {
        listCurrentTrainingAnnotations = fileparserFP.parseAnnotations(sourcePath);
    }

    // Validate the parsed input once, up front. Previously only the
    // preprocessing loop was guarded, so a null article list or an annotation
    // list of a different size let the labelling loop below run against empty
    // per-article lists and crash with an unhandled exception.
    if (listCurrentArticles == null)
    {
        System.Console.Error.WriteLine("No articles could be parsed from " + sourcePath + ".");
        System.Environment.ExitCode = 1;
        return;
    }

    if (isAnnotated && listCurrentArticles.Count > 0 &&
        (listCurrentTrainingAnnotations == null ||
         listCurrentTrainingAnnotations.Count != listCurrentArticles.Count))
    {
        System.Console.Error.WriteLine(
            "The number of annotations parsed from " + sourcePath +
            " does not match the number of articles (" + listCurrentArticles.Count + ").");
        System.Environment.ExitCode = 1;
        return;
    }

    // Per-article results of the preprocessing stage; index nI in each list
    // corresponds to listCurrentArticles[nI].
    List<List<Token>> listTokenizedArticles = new List<List<Token>>();
    List<List<Candidate>> listAllWhoCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhenCandidates = new List<List<Candidate>>();
    List<List<Candidate>> listAllWhereCandidates = new List<List<Candidate>>();
    List<List<List<Token>>> listAllWhatCandidates = new List<List<List<Token>>>();
    List<List<List<Token>>> listAllWhyCandidates = new List<List<List<Token>>>();

    // Per-article results of the labelling stage.
    List<List<String>> listAllWhoAnnotations = new List<List<String>>();
    List<List<String>> listAllWhenAnnotations = new List<List<String>>();
    List<List<String>> listAllWhereAnnotations = new List<List<String>>();
    List<String> listAllWhatAnnotations = new List<String>();
    List<String> listAllWhyAnnotations = new List<String>();

    Preprocessor preprocessor = new Preprocessor();

    // Stage 1: preprocess every article (an empty article list simply skips
    // the loop and still produces empty outputs, as before).
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        preprocessor.setCurrentArticle(listCurrentArticles[nI]);
        preprocessor.preprocess();

        if (isAnnotated)
        {
            preprocessor.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
            preprocessor.performAnnotationAssignment();
        }

        listTokenizedArticles.Add(preprocessor.getLatestTokenizedArticle());
        listAllWhoCandidates.Add(preprocessor.getWhoCandidates());
        listAllWhenCandidates.Add(preprocessor.getWhenCandidates());
        listAllWhereCandidates.Add(preprocessor.getWhereCandidates());
        listAllWhatCandidates.Add(preprocessor.getWhatCandidates());
        listAllWhyCandidates.Add(preprocessor.getWhyCandidates());
    }

    // Disabled: who/when/where training (was only meant to run when isAnnotated).
    /*Trainer trainer = new Trainer();
     * trainer.trainMany("who", listTokenizedArticles, listAllWhoCandidates);
     * trainer.trainMany("when", listTokenizedArticles, listAllWhenCandidates);
     * trainer.trainMany("where", listTokenizedArticles, listAllWhereCandidates);*/

    #region Candidate Selection Printer
    /*Candidate Selection Printer*/
    /*try
     * {
     *  var whoCandidatesPath = @"..\..\candidates_who.txt";
     *  var whenCandidatesPath = @"..\..\candidates_when.txt";
     *  var whereCandidatesPath = @"..\..\candidates_where.txt";
     *
     *  if (File.Exists(whoCandidatesPath)) File.Delete(whoCandidatesPath);
     *  if (File.Exists(whenCandidatesPath)) File.Delete(whenCandidatesPath);
     *  if (File.Exists(whereCandidatesPath)) File.Delete(whereCandidatesPath);
     *
     *  using (StreamWriter sw = File.CreateText(whoCandidatesPath))
     *  {
     *      for (int nI = 0; nI < listAllWhoCandidates.Count; nI++)
     *      {
     *          sw.WriteLine("#{0}:", nI);
     *          foreach (var candidate in listAllWhoCandidates[nI])
     *          {
     *              sw.Write(candidate.Value + ", ");
     *          }
     *          sw.WriteLine("\n");
     *      }
     *  }
     *  using (StreamWriter sw = File.CreateText(whenCandidatesPath))
     *  {
     *      for (int nI = 0; nI < listAllWhenCandidates.Count; nI++)
     *      {
     *          sw.WriteLine("#{0}:", nI);
     *          foreach (var candidate in listAllWhenCandidates[nI])
     *          {
     *              sw.Write(candidate.Value + ", ");
     *          }
     *          sw.WriteLine("\n");
     *      }
     *  }
     *  using (StreamWriter sw = File.CreateText(whereCandidatesPath))
     *  {
     *      for (int nI = 0; nI < listAllWhereCandidates.Count; nI++)
     *      {
     *          sw.WriteLine("#{0}:", nI);
     *          foreach (var candidate in listAllWhereCandidates[nI])
     *          {
     *              sw.Write(candidate.Value + ", ");
     *          }
     *          sw.WriteLine("\n");
     *      }
     *  }
     * }
     * catch (Exception e)
     * {
     *  System.Console.WriteLine("Error with writing initial line of training dataset.");
     * }*/
    #endregion

    // Stage 2: label every article. The WhyTrainer session is opened before
    // the first article and closed after the last one when training data is
    // available.
    WhyTrainer wt = new WhyTrainer();
    if (isAnnotated)
    {
        wt.startTrain();
    }

    Identifier annotationIdentifier = new Identifier(isAnnotated, wt);

    // Safe to index the per-article lists here: stage 1 added exactly one
    // entry to each of them for every element of listCurrentArticles.
    for (int nI = 0; nI < listCurrentArticles.Count; nI++)
    {
        annotationIdentifier.setCurrentArticle(listTokenizedArticles[nI]);
        annotationIdentifier.setWhoCandidates(listAllWhoCandidates[nI]);
        annotationIdentifier.setWhenCandidates(listAllWhenCandidates[nI]);
        annotationIdentifier.setWhereCandidates(listAllWhereCandidates[nI]);
        annotationIdentifier.setWhatCandidates(listAllWhatCandidates[nI]);
        annotationIdentifier.setWhyCandidates(listAllWhyCandidates[nI]);
        annotationIdentifier.setTitle(listCurrentArticles[nI].Title);

        if (isAnnotated)
        {
            annotationIdentifier.setCurrentAnnotation(listCurrentTrainingAnnotations[nI]);
        }

        annotationIdentifier.labelAnnotations();

        listAllWhoAnnotations.Add(annotationIdentifier.getWho());
        listAllWhenAnnotations.Add(annotationIdentifier.getWhen());
        listAllWhereAnnotations.Add(annotationIdentifier.getWhere());
        listAllWhatAnnotations.Add(annotationIdentifier.getWhat());
        listAllWhyAnnotations.Add(annotationIdentifier.getWhy());
    }

    if (isAnnotated)
    {
        wt.endTrain();
    }

    // Stage 3: write the results.
    ResultWriter rw = new ResultWriter(
        destinationPath,
        formatDateDestinationPath,
        invertedDestinationPath,
        listCurrentArticles,
        listAllWhoAnnotations,
        listAllWhenAnnotations,
        listAllWhereAnnotations,
        listAllWhatAnnotations,
        listAllWhyAnnotations);
    rw.generateOutput();
    rw.generateOutputFormatDate();
    rw.generateInvertedIndexOutput();
#endif
}