/// <summary>
/// Splits the "perdocs" summary file of every sub-directory of the duplicate-summaries
/// directory into one text file per summarized document.
/// </summary>
/// <remarks>
/// For each sub-directory, the first file whose path ends with "perdocs" is read, wrapped in a
/// synthetic &lt;roo&gt; element, passed through <c>SanityXml.Sanity</c> and parsed as XML.
/// Every SUM child element is written to "&lt;DOCREF&gt;.txt" in the output directory with
/// leading and trailing whitespace removed. Sub-directories without a "perdocs" file and
/// SUM elements without a DOCREF attribute are skipped.
/// </remarks>
public static void ProcessSummaries()
{
    var summDir = @"D:\Tesis2016\DUC2001_Summarization_Documents\data\test\duplicate.summaries";
    var summOutputDir = @"D:\Tesis2016\DUC2001_Summarization_Documents\data\testOutput\";

    // File.WriteAllText does not create missing directories; this is a no-op when it already exists.
    Directory.CreateDirectory(summOutputDir);

    foreach (var targetDir in Directory.GetDirectories(summDir))
    {
        var file = Directory.GetFiles(targetDir)
            .FirstOrDefault(c => c.EndsWith("perdocs", StringComparison.Ordinal));
        if (file == null)
        {
            // No summary file in this directory: nothing to split.
            continue;
        }

        // The content is wrapped in a single root element before parsing
        // (presumably because the file holds several sibling SUM elements - confirm against the data).
        var str = "<roo>" + File.ReadAllText(file) + "</roo>";
        var element = XElement.Parse(SanityXml.Sanity(str));

        foreach (var sum in element.Elements("SUM"))
        {
            // The explicit conversion yields null instead of throwing when the attribute is absent.
            var docRef = (string)sum.Attribute("DOCREF");
            if (string.IsNullOrEmpty(docRef))
            {
                continue;
            }

            File.WriteAllText(Path.Combine(summOutputDir, docRef) + ".txt", sum.Value.Trim());
        }
    }
}
/// <summary>
/// Converts every training document set into plain-text documents and their per-document summaries.
/// </summary>
/// <remarks>
/// For each sub-directory of the training directory:
/// <list type="number">
/// <item>Every file in the first child directory whose path ends with "docs" is sanitized, parsed
/// as XML, and its text is written to "&lt;name&gt;.txt" under <c>DUC_Processed_Text</c>.</item>
/// <item>Every file ending with "perdocs" in the first child directory that does not end with "docs"
/// is parsed, and for each processed document the matching SUM element (by DOCREF) is written to
/// "&lt;name&gt;.txt" under <c>DUC_Processed_Summaries</c>. When a perdocs file has no summary for a
/// document, that document's processed text file is deleted.</item>
/// </list>
/// </remarks>
/// <exception cref="NotSupportedException">A document file name matches none of the known source prefixes.</exception>
/// <exception cref="InvalidOperationException">A training directory lacks the expected child directories or a document lacks a required element.</exception>
public static void ProcessTraining()
{
    var directories = Directory.GetDirectories(@"D:\Tesis2016\DUC2001_Summarization_Documents\data\training");
    foreach (var directory in directories)
    {
        // Names (without extension) of the documents processed for this directory.
        var documentNames = new HashSet<string>();
        var subdirs = Directory.GetDirectories(directory);

        var textDir = subdirs.First(c => c.EndsWith("docs", StringComparison.Ordinal));
        foreach (var file in Directory.GetFiles(textDir))
        {
            var fileName = Path.GetFileNameWithoutExtension(file);
            var element = XElement.Parse(SanityXml.Sanity(fileName, File.ReadAllText(file)));
            var text = ExtractTrainingDocumentText(fileName, element);

            documentNames.Add(fileName);
            File.WriteAllText(Path.Combine(DUC_Processed_Text, fileName) + ".txt", text.Trim());
        }

        var sumDir = subdirs.First(c => !c.EndsWith("docs", StringComparison.Ordinal));
        var perdocsFiles = Directory.GetFiles(sumDir)
            .Where(c => c.EndsWith("perdocs", StringComparison.Ordinal));
        foreach (var file in perdocsFiles)
        {
            // Wrapped in a single root element so that the content can be parsed as one XML document.
            var str = "<roo>" + File.ReadAllText(file) + "</roo>";
            var element = XElement.Parse(SanityXml.Sanity(str));

            foreach (var item in documentNames)
            {
                // The explicit conversion yields null (no match) for a SUM without DOCREF instead of throwing.
                var sum = element.Elements("SUM")
                    .FirstOrDefault(c => (string)c.Attribute("DOCREF") == item);

                if (sum == null)
                {
                    // The text file was written with a ".txt" suffix, so the same suffix is required here;
                    // File.Delete silently ignores a path that does not exist.
                    File.Delete(Path.Combine(DUC_Processed_Text, item) + ".txt");
                }
                else
                {
                    File.WriteAllText(Path.Combine(DUC_Processed_Summaries, item) + ".txt", sum.Value.Trim());
                }
            }
        }
    }
}

/// <summary>
/// Extracts the body text of one parsed training document, choosing the elements to read
/// from the source prefix of its file name.
/// </summary>
/// <param name="fileName">Document file name without extension; its prefix selects the layout.</param>
/// <param name="element">Root element of the sanitized, parsed document.</param>
/// <returns>The extracted text, each part terminated by a line break (not trimmed).</returns>
/// <exception cref="NotSupportedException"><paramref name="fileName"/> has no known prefix.</exception>
private static string ExtractTrainingDocumentText(string fileName, XElement element)
{
    var sb = new StringBuilder();

    if (fileName.StartsWith("SJMN", StringComparison.Ordinal))
    {
        // These documents contribute the lead paragraph followed by the body.
        var leadPara = element.Elements("LEADPARA").First();
        var body = element.Elements("TEXT").First();
        sb.AppendLine(leadPara.Value);
        sb.AppendLine(body.Value);
    }
    else if (fileName.StartsWith("AP", StringComparison.Ordinal))
    {
        // All TEXT elements are concatenated, one per line.
        foreach (var item in element.Elements("TEXT"))
        {
            sb.AppendLine(item.Value);
        }
    }
    else if (fileName.StartsWith("LA", StringComparison.Ordinal))
    {
        // Literal paragraph markers left in the text content are stripped.
        sb.AppendLine(element.Elements("TEXT").First().Value.Replace("<P>", "").Replace("</P>", ""));
    }
    else if (fileName.StartsWith("WSJ", StringComparison.Ordinal)
        || fileName.StartsWith("FBI", StringComparison.Ordinal)
        || fileName.StartsWith("FT", StringComparison.Ordinal))
    {
        sb.AppendLine(element.Elements("TEXT").First().Value);
    }
    else
    {
        throw new NotSupportedException("no pattern");
    }

    return sb.ToString();
}