public Task Run() { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("ADD CREDIT TO TEXT FILES\n"); Console.ResetColor(); Console.WriteLine( $"Text dir: {_txtDir}\n" + $"Resp value: {_respValue}\n" + $"PersName value: {_persValue}\n" + $"Dry run: {_dry}\n"); ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("ADD CREDIT TO TEXT FILES"); foreach (string filePath in FileEnumerator.Enumerate(_txtDir, @"^[^-]+-[^-]+\.xml", true, true)) { Console.WriteLine(filePath); Log.Logger.Information(filePath); XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace); // TEI/teiHeader/fileDesc/seriesStmt/ XElement series = doc.Root ?.Element(XmlHelper.TEI + "teiHeader") ?.Element(XmlHelper.TEI + "fileDesc") ?.Element(XmlHelper.TEI + "seriesStmt"); if (series == null) { Log.Logger?.Error( $"Unable to find seriesStmt in header for {Path.GetFileName(filePath)}"); continue; } // <respStmt> // <resp key="MQDQ">RESPVALUE</resp> // <persName>PERSVALUE</persName> // </respStmt> series.Add(new XElement(XmlHelper.TEI + "respStmt", new XElement(XmlHelper.TEI + "resp", new XAttribute("key", "MQDQ"), _respValue), new XElement(XmlHelper.TEI + "persName", _persValue))); if (!_dry) { doc.Save(filePath); } } return(Task.CompletedTask); }
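// The XmlHelper class used above (and in the commands below) is not shown here.
// A minimal sketch of the namespace constants it is assumed to expose, based on
// how they are used (hypothetical; the real helper may differ, and also defines
// CIT_SEPARATOR, whose value is not recoverable from this code):
// requires: using System.Xml.Linq;
internal static class XmlHelperSketch
{
    // TEI namespace, used to qualify header and apparatus elements.
    public static readonly XNamespace TEI = "http://www.tei-c.org/ns/1.0";

    // standard XML namespace, used to read xml:id attributes.
    public static readonly XNamespace XML = XNamespace.Xml;
}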
private static void Run()
{
    var sourceFiles = new FileEnumerator(The.Workspace.ProjectDirectory);
    using (new DirectoryChanger(The.Workspace.ProjectDirectory))
    {
        var files = sourceFiles.Enumerate(".cs");
        foreach (var fileInfo in files)
        {
            Console.WriteLine("* " + fileInfo.Path);
            ProcessSourceFile(fileInfo.Path);
        }
    }
    using (new DirectoryChanger(The.Workspace.AssetsDirectory))
    {
        var files = The.Workspace.AssetFiles.Enumerate(".tan");
        foreach (var fileInfo in files)
        {
            Console.WriteLine("* " + fileInfo.Path);
            ProcessSourceFile(fileInfo.Path);
        }
    }
}
public Task Run() { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("REMOVE OVERLAPS\n"); Console.ResetColor(); Console.WriteLine( $"Input: {_appFileMask}\n" + $"Output: {_outputDir}\n" + $"Div list: {(_writeDivList ? "yes" : "no")}\n"); int inputFileCount = 0; int removedCount = 0; ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("REMOVE OVERLAPS"); if (!Directory.Exists(_outputDir)) { Directory.CreateDirectory(_outputDir); } HashSet <string> errDivIds = new HashSet <string>(); // for each app document WordIdList widList = new WordIdList { Logger = loggerFactory.CreateLogger("report-overlaps") }; foreach (string filePath in FileEnumerator.Enumerate( _appFileDir, _appFileMask, _regexMask, _recursive)) { Console.WriteLine(); Log.Logger.Information("Parsing {FilePath}", filePath); // load app document string inputFileName = Path.GetFileNameWithoutExtension(filePath); Console.WriteLine(filePath); inputFileCount++; XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo); // collect word IDs from text document widList.Parse(XDocument.Load(filePath.Replace("-app.", "."))); // collect app's locations List <AppElemLocations> appElemLocs = AppElemLocationCollector.Collect(doc, widList, AppElemLocationCollector.IsOverlappable); // detect and process overlaps for (int i = 0; i < appElemLocs.Count - 1; i++) { for (int j = i + 1; j < appElemLocs.Count; j++) { if (appElemLocs[i].Overlaps(appElemLocs[j])) { // pick the target between the two overlapping app's AppElemLocations target, source; int targetIndex, sourceIndex; if (IsFirstTarget(appElemLocs[i], appElemLocs[j])) { target = appElemLocs[targetIndex = i]; source = appElemLocs[sourceIndex = j]; } else { source = appElemLocs[sourceIndex = i]; target = appElemLocs[targetIndex = j]; } Log.Logger.Information("Merging overlapping app " + $"{GetAttributesDump(source.Element)} into " + GetAttributesDump(target.Element)); // log error if the source had @wit/@source if (LemHasLostAttributes( source.Element.Element(XmlHelper.TEI + "lem"), target.Element.Element(XmlHelper.TEI + "lem"))) { string divId = source.Element.Ancestors( XmlHelper.TEI + "div1") .First() .Attribute(XmlHelper.XML + "id").Value; errDivIds.Add(divId); Log.Logger.Error("Removed overlapping app lost sources at div " + divId + ": " + GetAttributesDump(source.Element)); } // append content of source into target in XML, // excluding the lem child, and adding @n to each child string nValue = source.Element.Attribute("from").Value.Substring(1) + " " + source.Element.Attribute("to").Value.Substring(1); foreach (XElement child in source.Element.Elements() .Where(e => e.Name.LocalName != "lem")) { child.SetAttributeValue("n", nValue); target.Element.Add(child); } // remove source from XML and locs source.Element.Remove(); appElemLocs.RemoveAt(sourceIndex); removedCount++; // continue looking from overlaps from the first // of the two app's involved i = Math.Min(sourceIndex, targetIndex) - 1; goto nextOuter; } } // j nextOuter: if (i % 10 == 0) { Console.Write('.'); } } // i // save string path = Path.Combine(_outputDir, Path.GetFileName(filePath)); doc.Save(path, SaveOptions.OmitDuplicateNamespaces); } if (_writeDivList) { using (StreamWriter listWriter = new StreamWriter( Path.Combine(_outputDir, "overlap-err-divs.txt"), false, Encoding.UTF8)) { foreach (string id in errDivIds) { listWriter.WriteLine(id); } listWriter.Flush(); } } Console.WriteLine($"\nInput documents: 
{inputFileCount}"); Console.WriteLine($"Removed overlaps: {removedCount}"); return(Task.CompletedTask); }
public Task Run() { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("PARSE TEXT\n"); Console.ResetColor(); Console.WriteLine( $"Input dir: {_inputFileDir}\n" + $"Input mask: {_inputFileMask}\n" + $"Output dir: {_outputDir}\n" + $"Div IDs list: {_flagDivIdList ?? "(none)"}\n" + $"Max items per file: {_maxItemPerFile}\n"); ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("PARSE TEXT"); XmlTextParser parser = new XmlTextParser { Logger = loggerFactory.CreateLogger("parse-text") }; int inputFileCount = 0; int totalItemCount = 0; StreamWriter writer = null; if (!Directory.Exists(_outputDir)) { Directory.CreateDirectory(_outputDir); } // load div IDs list if requested, prefixing and suffixing them // so that we are ready to find them in the item's title HashSet <string> flagDivIds = _flagDivIdList != null ? LoadDivIds(_flagDivIdList, "xml:id=", XmlHelper.CIT_SEPARATOR) : null; // for each input document foreach (string filePath in FileEnumerator.Enumerate( _inputFileDir, _inputFileMask, _regexMask)) { // load document string inputFileName = Path.GetFileNameWithoutExtension(filePath); Console.WriteLine("\n" + filePath); inputFileCount++; XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace); JsonSerializerSettings jsonSettings = new JsonSerializerSettings { ContractResolver = new DefaultContractResolver { NamingStrategy = new CamelCaseNamingStrategy() }, Formatting = Formatting.Indented }; // parse items int itemCount = 0, outputFileCount = 0; foreach (IItem item in parser.Parse( doc, Path.GetFileNameWithoutExtension(filePath))) { if (++itemCount % 10 == 0) { Console.Write('.'); } // set flag if required if (flagDivIds.Any(s => item.Title.IndexOf(s, StringComparison.Ordinal) > -1)) { item.Flags |= 1; } // create new output file if required if (writer == null || (_maxItemPerFile > 0 && itemCount > _maxItemPerFile)) { if (writer != null) { CloseOutputFile(writer); } string path = Path.Combine(_outputDir, $"{inputFileName}_{++outputFileCount:00000}.json"); writer = new StreamWriter(new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.Read), Encoding.UTF8); writer.WriteLine("["); } // dump item into it string json = JsonConvert.SerializeObject( item, jsonSettings); // string json = JsonSerializer.Serialize(item, typeof(object), options); // this will output a , also for the last JSON array item, // but we don't care about it -- that's just a dump, and // it's easy to ignore/remove it if needed. writer.WriteLine(json + ","); } totalItemCount += itemCount; if (writer != null) { CloseOutputFile(writer); writer = null; } } Console.WriteLine($"\nInput documents: {inputFileCount}"); Console.WriteLine($"Output items: {totalItemCount}"); return(Task.CompletedTask); }
public Task Run() { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("REPORT OVERLAPS\n"); Console.ResetColor(); Console.WriteLine( $"Input: {_appFileMask}\n" + $"Output: {_outputPath}\n"); int inputFileCount = 0; int overlapCount = 0; ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("REPORT OVERLAPS"); using (StreamWriter writer = new StreamWriter(_outputPath, false, Encoding.UTF8)) { writer.WriteLine("# Overlaps Report"); writer.WriteLine(); writer.WriteLine($"Input: `{_appFileDir}{Path.DirectorySeparatorChar}{_appFileMask}`"); writer.WriteLine(); // for each app document WordIdList widList = new WordIdList { Logger = loggerFactory.CreateLogger("report-overlaps") }; foreach (string filePath in FileEnumerator.Enumerate( _appFileDir, _appFileMask, _regexMask, _recursive)) { Console.WriteLine(); Log.Logger.Information("Parsing {FilePath}", filePath); // load app document string inputFileName = Path.GetFileNameWithoutExtension(filePath); Console.WriteLine(filePath); inputFileCount++; XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo); // collect word IDs from text document widList.Parse(XDocument.Load(filePath.Replace("-app.", "."))); // collect app's locations List <AppElemLocations> appElemLocs = AppElemLocationCollector.Collect(doc, widList, AppElemLocationCollector.IsOverlappable); // detect and report overlaps for (int i = 0; i < appElemLocs.Count - 1; i++) { for (int j = i + 1; j < appElemLocs.Count; j++) { if (appElemLocs[i].Overlaps(appElemLocs[j])) { writer.WriteLine($"## Overlap {++overlapCount}"); writer.WriteLine(); writer.WriteLine(Path.GetFileName(filePath) + $" at {appElemLocs[i].LineNumber}"); // text int n = 0; foreach (var iw in appElemLocs[i].Locations) { if (++n > 1) { writer.Write(' '); } writer.Write($"`{iw.Item1}`=`{iw.Item2}`"); } writer.WriteLine(); writer.WriteLine(); // app WriteAppXml(appElemLocs[i], writer); WriteAppXml(appElemLocs[j], writer); goto nextOuter; } } nextOuter: if (i % 10 == 0) { Console.Write('.'); } } Console.WriteLine(); } writer.Flush(); } Console.WriteLine($"\nInput documents: {inputFileCount}"); return(Task.CompletedTask); }
/// <summary> /// Runs this command. /// </summary> public Task Run() { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("PARTITION\n"); Console.ResetColor(); Console.WriteLine( $"Input dir: {_inputDir}\n" + $"Input mask: {_fileMask}\n" + $"Output dir: {_outputDir}\n" + $"Min: {_minTreshold}\n" + $"Max: {_maxTreshold}\n" + $"Recursive: {_recursive}\n"); Log.Logger.Information("PARTITION"); XmlPartitioner partitioner = new XmlPartitioner { MinTreshold = _minTreshold, MaxTreshold = _maxTreshold }; int partitioned = 0, total = 0; if (!Directory.Exists(_outputDir)) { Directory.CreateDirectory(_outputDir); } foreach (string filePath in FileEnumerator.Enumerate( _inputDir, _fileMask, _regexMask, _recursive)) { total++; Console.Write(filePath); XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace); bool touched = partitioner.Partition(doc, Path.GetFileNameWithoutExtension(filePath)); string outputPath = Path.Combine(_outputDir, Path.GetFileName(filePath)); if (touched) { partitioned++; Console.WriteLine($" => {outputPath}"); if (!Directory.Exists(_outputDir)) { Directory.CreateDirectory(_outputDir); } doc.Save(outputPath, SaveOptions.OmitDuplicateNamespaces); } else { File.Copy(filePath, outputPath); Console.WriteLine(); } } Console.WriteLine($"Total files: {total}"); Console.WriteLine($"Partitioned files: {partitioned}"); return(Task.CompletedTask); }
public Task Run() { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("IMPORT JSON TEXT AND APPARATUS\n"); Console.ResetColor(); Console.WriteLine( $"Text dir: {_txtFileDir}\n" + $"Text mask: {_txtFileMask}\n" + $"Apparatus dir: {_appFileDir}\n" + $"Profile file: {_profilePath}\n" + $"Database: {_database}\n" + $"Dry run: {_dry}\n"); ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("IMPORT JSON TEXT AND APPARATUS"); if (!_dry) { // create database if not exists string connection = string.Format(CultureInfo.InvariantCulture, _config.GetConnectionString("Mongo"), _database); IDatabaseManager manager = new MongoDatabaseManager(); string profileContent = LoadProfile(_profilePath); IDataProfileSerializer serializer = new JsonDataProfileSerializer(); DataProfile profile = serializer.Read(profileContent); if (!manager.DatabaseExists(connection)) { Console.WriteLine("Creating database..."); Log.Information($"Creating database {_database}..."); manager.CreateDatabase(connection, profile); Console.WriteLine("Database created."); Log.Information("Database created."); } } else { if (!File.Exists(_profilePath)) { string error = "Profile path not found: " + _profilePath; Console.WriteLine(error); Log.Error(error); return(Task.CompletedTask); } } ICadmusRepository repository = _repositoryService.CreateRepository(_database); JsonImporter importer = new JsonImporter(repository) { Logger = loggerFactory.CreateLogger("json-importer"), IsDry = _dry }; int inputFileCount = 0; // 1) import text string[] files = FileEnumerator.Enumerate( _txtFileDir, _txtFileMask, _regexMask).ToArray(); HashSet <string> fileNames = new HashSet <string>(); Console.WriteLine($"Importing text from {files.Length} file(s)..."); foreach (string txtFilePath in files) { fileNames.Add( StripFileNameNr( Path.GetFileNameWithoutExtension(txtFilePath))); Console.WriteLine(txtFilePath); inputFileCount++; using (Stream stream = new FileStream(txtFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) { importer.ImportText(stream); } } // 2) import apparatus Console.WriteLine("Importing apparatus..."); foreach (string fileName in fileNames) { Console.WriteLine(fileName); foreach (string appFilePath in Directory.EnumerateFiles( _appFileDir, fileName + "-app_*.json")) { Console.WriteLine(" " + appFilePath); using (Stream stream = new FileStream(appFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) { importer.ImportApparatus(stream); } } } return(Task.CompletedTask); }
public Task Run() { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("PARSE APPARATUS\n"); Console.ResetColor(); Console.WriteLine( $"Input: {_inputFileMask}\n" + $"Output: {_outputDir}\n" + $"Max items per file: {_maxItemPerFile}\n"); ILoggerFactory loggerFactory = new LoggerFactory(); loggerFactory.AddSerilog(Log.Logger); Log.Logger.Information("PARSE APPARATUS"); XmlApparatusParser parser = new XmlApparatusParser { Logger = loggerFactory.CreateLogger("parse-app") }; int inputFileCount = 0; int totalPartCount = 0; StreamWriter writer = null; if (!Directory.Exists(_outputDir)) { Directory.CreateDirectory(_outputDir); } // for each input document foreach (string filePath in FileEnumerator.Enumerate( _inputFileDir, _inputFileMask, _regexMask, _recursive)) { Console.WriteLine(); Log.Logger.Information("Parsing {FilePath}", filePath); // load document string inputFileName = Path.GetFileNameWithoutExtension(filePath); Console.WriteLine(filePath); inputFileCount++; XDocument doc = XDocument.Load(filePath, LoadOptions.PreserveWhitespace | LoadOptions.SetLineInfo); JsonSerializerSettings jsonSettings = new JsonSerializerSettings { ContractResolver = new DefaultContractResolver { NamingStrategy = new CamelCaseNamingStrategy() }, Formatting = Formatting.Indented }; // load index string textFileName = inputFileName.Replace("-app", ""); LoadTextIndex(textFileName); // parse int partCount = 0, outputFileCount = 0; foreach (var part in parser.Parse(doc, textFileName, _textIndex)) { if (++partCount % 10 == 0) { Console.Write('.'); } // create new output file if required if (writer == null || (_maxItemPerFile > 0 && partCount > _maxItemPerFile)) { if (writer != null) { CloseOutputFile(writer); } string path = Path.Combine(_outputDir, $"{inputFileName}_{++outputFileCount:00000}.json"); writer = new StreamWriter(new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.Read), Encoding.UTF8); writer.WriteLine("["); } // dump part into it string json = JsonConvert.SerializeObject(part, jsonSettings); writer.WriteLine(json + ","); } totalPartCount += partCount; if (writer != null) { CloseOutputFile(writer); writer = null; } } Console.WriteLine($"\nInput documents: {inputFileCount}"); Console.WriteLine($"Output parts: {totalPartCount}"); return(Task.CompletedTask); }