/// <summary>
/// Removes overlapping app elements from every app document matched by the
/// input mask, merging one app of each overlapping pair into the other, and
/// saves the resulting documents into the output directory.
/// </summary>
/// <remarks>
/// For each overlapping pair, <c>IsFirstTarget</c> picks the app to keep
/// (the target). All the children of the other app (the source) except
/// <c>lem</c> are appended to the target with an <c>n</c> attribute built
/// from the source's <c>from</c> and <c>to</c> attributes; the source is
/// then removed. When <c>LemHasLostAttributes</c> reports a loss, the ID of
/// the containing <c>div1</c> is collected and, if requested, written to
/// <c>overlap-err-divs.txt</c> in the output directory.
/// </remarks>
/// <returns>A completed task.</returns>
public Task Run()
{
    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("REMOVE OVERLAPS\n");
    Console.ResetColor();
    Console.WriteLine(
        $"Input: {_appFileMask}\n" +
        $"Output: {_outputDir}\n" +
        $"Div list: {(_writeDivList ? "yes" : "no")}\n");

    int inputFileCount = 0;
    int removedCount = 0;
    HashSet<string> errDivIds = new HashSet<string>();

    // the factory is disposable: release it once all the files are done
    using (ILoggerFactory loggerFactory = new LoggerFactory())
    {
        loggerFactory.AddSerilog(Log.Logger);
        Log.Logger.Information("REMOVE OVERLAPS");

        // CreateDirectory does nothing when the directory already exists
        Directory.CreateDirectory(_outputDir);

        WordIdList widList = new WordIdList
        {
            Logger = loggerFactory.CreateLogger("remove-overlaps")
        };

        // for each app document
        foreach (string filePath in FileEnumerator.Enumerate(
            _appFileDir, _appFileMask, _regexMask, _recursive))
        {
            Console.WriteLine();
            Log.Logger.Information("Parsing {FilePath}", filePath);

            // load app document
            Console.WriteLine(filePath);
            inputFileCount++;
            XDocument doc = XDocument.Load(filePath,
                LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo);

            // collect word IDs from the companion text document; "-app."
            // is replaced in the file name only, so that a directory name
            // containing it does not corrupt the path
            string textFilePath = Path.Combine(
                Path.GetDirectoryName(filePath) ?? "",
                Path.GetFileName(filePath).Replace("-app.", "."));
            widList.Parse(XDocument.Load(textFilePath));

            // collect app's locations
            List<AppElemLocations> appElemLocs =
                AppElemLocationCollector.Collect(doc, widList,
                    AppElemLocationCollector.IsOverlappable);

            // detect and process overlaps
            for (int i = 0; i < appElemLocs.Count - 1; i++)
            {
                for (int j = i + 1; j < appElemLocs.Count; j++)
                {
                    if (!appElemLocs[i].Overlaps(appElemLocs[j]))
                    {
                        continue;
                    }

                    // pick the target between the two overlapping app's
                    AppElemLocations target, source;
                    int targetIndex, sourceIndex;
                    if (IsFirstTarget(appElemLocs[i], appElemLocs[j]))
                    {
                        targetIndex = i;
                        sourceIndex = j;
                    }
                    else
                    {
                        sourceIndex = i;
                        targetIndex = j;
                    }
                    target = appElemLocs[targetIndex];
                    source = appElemLocs[sourceIndex];

                    Log.Logger.Information("Merging overlapping app " +
                        $"{GetAttributesDump(source.Element)} into " +
                        GetAttributesDump(target.Element));

                    // log error if the source had @wit/@source
                    if (LemHasLostAttributes(
                        source.Element.Element(XmlHelper.TEI + "lem"),
                        target.Element.Element(XmlHelper.TEI + "lem")))
                    {
                        // NOTE(review): throws if the app has no div1
                        // ancestor or the div1 lacks @xml:id - confirm
                        // that the input always provides both
                        string divId = source.Element
                            .Ancestors(XmlHelper.TEI + "div1")
                            .First()
                            .Attribute(XmlHelper.XML + "id").Value;
                        errDivIds.Add(divId);
                        Log.Logger.Error("Removed overlapping app lost sources at div "
                            + divId + ": " + GetAttributesDump(source.Element));
                    }

                    // append content of source into target in XML,
                    // excluding the lem child, and adding @n to each child
                    // (@n = source's @from and @to without their 1st char)
                    string nValue =
                        source.Element.Attribute("from").Value.Substring(1) +
                        " " +
                        source.Element.Attribute("to").Value.Substring(1);

                    // Add clones a node which already has a parent, so the
                    // source's children are not moved while enumerating
                    foreach (XElement child in source.Element.Elements()
                        .Where(e => e.Name.LocalName != "lem"))
                    {
                        child.SetAttributeValue("n", nValue);
                        target.Element.Add(child);
                    }

                    // remove source from XML and locs
                    source.Element.Remove();
                    appElemLocs.RemoveAt(sourceIndex);
                    removedCount++;

                    // continue looking for overlaps from the first of the
                    // two app's involved: step back by one so that the outer
                    // loop increment lands again on that index
                    i = Math.Min(sourceIndex, targetIndex) - 1;
                    break;
                } // j

                // progress
                if (i % 10 == 0)
                {
                    Console.Write('.');
                }
            } // i

            // save
            string path = Path.Combine(_outputDir, Path.GetFileName(filePath));
            doc.Save(path, SaveOptions.OmitDuplicateNamespaces);
        }
    }

    if (_writeDivList)
    {
        // disposing the writer flushes it
        using (StreamWriter listWriter = new StreamWriter(
            Path.Combine(_outputDir, "overlap-err-divs.txt"),
            false, Encoding.UTF8))
        {
            foreach (string id in errDivIds)
            {
                listWriter.WriteLine(id);
            }
        }
    }

    Console.WriteLine($"\nInput documents: {inputFileCount}");
    Console.WriteLine($"Removed overlaps: {removedCount}");

    return Task.CompletedTask;
}
/// <summary>
/// Writes a Markdown report of the overlapping app elements found in every
/// app document matched by the input mask.
/// </summary>
/// <remarks>
/// For each app, only its first overlap with a following app is reported:
/// the report gets a numbered heading, the file name with the line number of
/// the first app, the locations of the first app, and the XML of both apps
/// as written by <c>WriteAppXml</c>.
/// </remarks>
/// <returns>A completed task.</returns>
public Task Run()
{
    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("REPORT OVERLAPS\n");
    Console.ResetColor();
    Console.WriteLine(
        $"Input: {_appFileMask}\n" +
        $"Output: {_outputPath}\n");

    int inputFileCount = 0;
    int overlapCount = 0;

    // the factory is disposable: release it once the report is complete
    using (ILoggerFactory loggerFactory = new LoggerFactory())
    {
        loggerFactory.AddSerilog(Log.Logger);
        Log.Logger.Information("REPORT OVERLAPS");

        // disposing the writer flushes it
        using (StreamWriter writer = new StreamWriter(_outputPath, false,
            Encoding.UTF8))
        {
            writer.WriteLine("# Overlaps Report");
            writer.WriteLine();
            writer.WriteLine($"Input: `{_appFileDir}{Path.DirectorySeparatorChar}{_appFileMask}`");
            writer.WriteLine();

            WordIdList widList = new WordIdList
            {
                Logger = loggerFactory.CreateLogger("report-overlaps")
            };

            // for each app document
            foreach (string filePath in FileEnumerator.Enumerate(
                _appFileDir, _appFileMask, _regexMask, _recursive))
            {
                Console.WriteLine();
                Log.Logger.Information("Parsing {FilePath}", filePath);

                // load app document
                Console.WriteLine(filePath);
                inputFileCount++;
                XDocument doc = XDocument.Load(filePath,
                    LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo);

                // collect word IDs from the companion text document; "-app."
                // is replaced in the file name only, so that a directory
                // name containing it does not corrupt the path
                string textFilePath = Path.Combine(
                    Path.GetDirectoryName(filePath) ?? "",
                    Path.GetFileName(filePath).Replace("-app.", "."));
                widList.Parse(XDocument.Load(textFilePath));

                // collect app's locations
                List<AppElemLocations> appElemLocs =
                    AppElemLocationCollector.Collect(doc, widList,
                        AppElemLocationCollector.IsOverlappable);

                // detect and report overlaps
                for (int i = 0; i < appElemLocs.Count - 1; i++)
                {
                    for (int j = i + 1; j < appElemLocs.Count; j++)
                    {
                        if (!appElemLocs[i].Overlaps(appElemLocs[j]))
                        {
                            continue;
                        }

                        writer.WriteLine($"## Overlap {++overlapCount}");
                        writer.WriteLine();
                        writer.WriteLine(Path.GetFileName(filePath) +
                            $" at {appElemLocs[i].LineNumber}");

                        // text: space-delimited pairs from the first app
                        int n = 0;
                        foreach (var iw in appElemLocs[i].Locations)
                        {
                            if (++n > 1)
                            {
                                writer.Write(' ');
                            }
                            writer.Write($"`{iw.Item1}`=`{iw.Item2}`");
                        }
                        writer.WriteLine();
                        writer.WriteLine();

                        // app
                        WriteAppXml(appElemLocs[i], writer);
                        WriteAppXml(appElemLocs[j], writer);

                        // only the first overlap of each app is reported
                        break;
                    } // j

                    // progress
                    if (i % 10 == 0)
                    {
                        Console.Write('.');
                    }
                } // i
                Console.WriteLine();
            }
        }
    }

    Console.WriteLine($"\nInput documents: {inputFileCount}");
    // the sibling remove command prints its counter too
    Console.WriteLine($"Overlaps: {overlapCount}");

    return Task.CompletedTask;
}