Пример #1
0
        /// <summary>
        /// Splits the per-document summary files found under the hard-coded test summaries
        /// directory into one text file per summary in the hard-coded output directory.
        /// </summary>
        /// <remarks>
        /// For every sub-directory of the summaries directory, the first file whose path ends with
        /// "perdocs" is read, wrapped in a synthetic <c>&lt;roo&gt;</c> root element, passed through
        /// <c>SanityXml.Sanity</c> and parsed as XML. Each direct <c>SUM</c> child is written to
        /// "&lt;DOCREF&gt;.txt" with leading and trailing whitespace removed; a later summary with
        /// the same DOCREF overwrites an earlier one. Sub-directories without a "perdocs" file and
        /// <c>SUM</c> elements without a DOCREF attribute are skipped.
        /// </remarks>
        public static void ProcessSummaries()
        {
            var summDir       = @"D:\Tesis2016\DUC2001_Summarization_Documents\data\test\duplicate.summaries";
            var summOutputDir = @"D:\Tesis2016\DUC2001_Summarization_Documents\data\testOutput\";

            // File.WriteAllText does not create missing directories; this is a no-op when it exists.
            Directory.CreateDirectory(summOutputDir);

            foreach (var targetDir in Directory.GetDirectories(summDir))
            {
                var file = Directory.GetFiles(targetDir)
                                    .FirstOrDefault(c => c.EndsWith("perdocs", StringComparison.Ordinal));

                if (file == null)
                {
                    // Nothing to extract here; previously this crashed inside File.ReadAllText(null).
                    continue;
                }

                // The file content is wrapped in a single root element before parsing
                // (presumably because it holds several top-level SUM elements - TODO confirm).
                var str     = "<roo>" + File.ReadAllText(file) + "</roo>";
                var element = XElement.Parse(SanityXml.Sanity(str));

                foreach (var sum in element.Elements("SUM"))
                {
                    // The explicit string conversion yields null for a missing attribute instead of throwing.
                    var docRef = (string)sum.Attribute("DOCREF");

                    if (docRef == null)
                    {
                        // Without a DOCREF there is no file name to write the summary to.
                        continue;
                    }

                    File.WriteAllText(Path.Combine(summOutputDir, docRef) + ".txt", sum.Value.Trim());
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Converts the training documents under the hard-coded training directory into plain-text
        /// files and extracts the matching per-document summaries.
        /// </summary>
        /// <remarks>
        /// For every training sub-directory:
        /// <list type="number">
        /// <item>Each file in the first sub-folder whose path ends with "docs" is passed through
        /// <c>SanityXml.Sanity</c>, parsed as XML, and its body text (selected by the file-name
        /// prefix: SJMN, WSJ, FBI, AP, LA or FT) is written to
        /// <c>DUC_Processed_Text</c> as "&lt;name&gt;.txt".</item>
        /// <item>Each file ending with "perdocs" in the first sub-folder that does not end with
        /// "docs" is wrapped in a synthetic <c>&lt;roo&gt;</c> root and parsed. For every document
        /// from step 1, the first <c>SUM</c> element whose DOCREF equals the document name is
        /// written to <c>DUC_Processed_Summaries</c>; when there is none, the document's text file
        /// from step 1 is deleted.</item>
        /// </list>
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// A document file name starts with none of the known prefixes, or a required sub-folder or
        /// XML element is missing.
        /// </exception>
        public static void ProcessTraining()
        {
            var directories = Directory.GetDirectories(@"D:\Tesis2016\DUC2001_Summarization_Documents\data\training");

            foreach (var directory in directories)
            {
                // Names (without extension) of the documents whose text was extracted from this directory.
                var documentNames = new HashSet<string>();

                var subdirs = Directory.GetDirectories(directory);
                var textDir = subdirs.First(c => c.EndsWith("docs", StringComparison.Ordinal));

                foreach (var file in Directory.GetFiles(textDir))
                {
                    var sb        = new StringBuilder();
                    var file_name = Path.GetFileNameWithoutExtension(file);
                    var element   = XElement.Parse(SanityXml.Sanity(file_name, File.ReadAllText(file)));

                    if (file_name.StartsWith("SJMN", StringComparison.Ordinal))
                    {
                        // These documents contribute the lead paragraph followed by the text.
                        sb.AppendLine(element.Elements("LEADPARA").First().Value);
                        sb.AppendLine(element.Elements("TEXT").First().Value);
                    }
                    else if (file_name.StartsWith("AP", StringComparison.Ordinal))
                    {
                        // Only this prefix collects every TEXT element; the others use just the first one.
                        foreach (var text in element.Elements("TEXT"))
                        {
                            sb.AppendLine(text.Value);
                        }
                    }
                    else if (file_name.StartsWith("LA", StringComparison.Ordinal))
                    {
                        // Strips literal paragraph tags that remain in the text content.
                        sb.AppendLine(element.Elements("TEXT").First().Value.Replace("<P>", "").Replace("</P>", ""));
                    }
                    else if (file_name.StartsWith("WSJ", StringComparison.Ordinal)
                             || file_name.StartsWith("FBI", StringComparison.Ordinal)
                             || file_name.StartsWith("FT", StringComparison.Ordinal))
                    {
                        sb.AppendLine(element.Elements("TEXT").First().Value);
                    }
                    else
                    {
                        throw new InvalidOperationException("no pattern");
                    }

                    documentNames.Add(file_name);
                    File.WriteAllText(Path.Combine(DUC_Processed_Text, file_name) + ".txt", sb.ToString().Trim());
                }

                var sumDir = subdirs.First(c => !c.EndsWith("docs", StringComparison.Ordinal));

                foreach (var file in Directory.GetFiles(sumDir).Where(c => c.EndsWith("perdocs", StringComparison.Ordinal)))
                {
                    // The file content is wrapped in a single root element before parsing
                    // (presumably because it holds several top-level SUM elements - TODO confirm).
                    var str     = "<roo>" + File.ReadAllText(file) + "</roo>";
                    var element = XElement.Parse(SanityXml.Sanity(str));

                    // Index the summaries once instead of rescanning them for every document.
                    // Only the first SUM per DOCREF is kept; SUM elements without DOCREF are ignored.
                    var summariesByDocRef = new Dictionary<string, XElement>();
                    foreach (var sum in element.Elements("SUM"))
                    {
                        var docRef = (string)sum.Attribute("DOCREF");
                        if (docRef != null && !summariesByDocRef.ContainsKey(docRef))
                        {
                            summariesByDocRef.Add(docRef, sum);
                        }
                    }

                    foreach (var item in documentNames)
                    {
                        XElement summary;
                        if (summariesByDocRef.TryGetValue(item, out summary))
                        {
                            File.WriteAllText(Path.Combine(DUC_Processed_Summaries, item) + ".txt", summary.Value.Trim());
                        }
                        else
                        {
                            // No summary for this document: remove the text file written above.
                            // The ".txt" suffix must match the name used when writing it; without it
                            // the delete targeted a non-existent path and silently did nothing.
                            // NOTE(review): if a directory ever holds several "perdocs" files, a document
                            // missing from any one of them is deleted - confirm that is intended.
                            File.Delete(Path.Combine(DUC_Processed_Text, item) + ".txt");
                        }
                    }
                }
            }
        }