/// <summary>
/// Parses the text copied from the CBHPM PDF and builds the object tree out of it:
/// pages, chapters, ProcTipos, ProcSubtipos, procedures and attachments
/// (observations / instructions).
/// </summary>
/// <remarks>
/// Pages are separated by form-feed characters ('\f'). The text may carry inline markers
/// starting with "####" that steer the parser: ERRO, EXPLICIT-MISSING, ANEXO-INICIO,
/// ANEXO-FIM, GROUP and ANEXO-CAPITULO. At the end, every code found anywhere in the text
/// must have been consumed by the parser (or declared through EXPLICIT-MISSING).
/// </remarks>
/// <param name="text">Full text of the document, with pages separated by '\f'.</param>
/// <exception cref="Exception">
/// Thrown when a line or page does not follow the expected layout, or when some code
/// present in the text was not processed.
/// </exception>
public void LoadFromTextCopiedOfPdf(string text)
{
    // Strips a "####..." marker (and the spaces before it) up to the end of the line.
    const string patternMarker = @" *####.*?(?=\r|\n|$)";

    // Reading all codes that exist in the text, so that we can compare with
    // the list of processed codes to know if any is missing at the end.
    var codigosExistentes = new HashSet<string>();
    var matchCodigos = Regex.Matches(text, @"\d\s?\.\s?\d\d\s?\.\s?\d\d\s?\.\s?\d\d\s?\-\s?\d\d?");
    foreach (Match eachMatch in matchCodigos)
    {
        codigosExistentes.Add(eachMatch.Value);
    }

    var codigosUsados = new HashSet<string>();

    // Regexes that are going to be used.
    var regexCapitulo = new Regex(
        @"^\s*(?<NUM>\d+)?\s*CAPÍTULO\s*(?<NUM>\d+)?\s*(?<NAME>.*?)\s*$",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var regexProcedimentoTipo = new Regex(
        @"^([\s\w/\-]+?)\s+(\d\.\d\d\.\d\d\.\d\d\-\d)$",
        RegexOptions.IgnoreCase);
    var regexProcedimentoSubtipo = new Regex(
        @"^(.*?)\s+\(\s?(\d\.\d\d\.\d\d\.\d\d\-\d)\)$",
        RegexOptions.IgnoreCase);
    Regex regexProcedimentosCapitulo = null; // this one is dynamic
    var regexColumnTitles = new Regex(
        @"^(?:(?:(Código|Procedimentos|Porte|Custo|Oper\.|Nº de|Aux\.?|Anest\.|Inc\.|Filme|ou Doc|UR)|(\w+\.?))(?:\s+|$))+$",
        RegexOptions.IgnoreCase);

    // Maps each column name to the pattern of the values of that column.
    // Columns mapped to null contribute no value pattern.
    var mapColNamePattern = new Dictionary<string, string>
    {
        { "código", null },
        { "procedimentos", null },
        { "porte", @"(?<PORTE>(?:(?:[\d,]+\s*de\s*)?\d+[a-z]+)|-)" },
        { "custo oper", @"(?<CUSTO>-|\d+,\d{2,3})" },
        { "porte anest", @"(?<ANEST>-|\d+)" },
        { "nº de aux", @"(?<AUX>-|\d+)" },
        { "ur", @"(?<UR>\*)" },
        { "filme ou doc", @"(?<DOC>-|\d+,\d{4})" },
        { "inc", @"(?<INC>-|\d+)" },
    };

    // Template of a procedure line; {0} receives the value patterns of the page columns.
    // It is compiled with IgnorePatternWhitespace, so the layout below is not significant.
    const string patternProcLine = @"
        ^
        (?<CODE>\d\.\d\d\.\d\d\.\d\d\-\d)?\s*
        (?<NAME>.+?)\s*
        (?:
            (?<DOTS>[\.\s]*\.\s*)
            {0}
        )?
        \s*
        (?:(?=\r\n|\r|\n|$))
        ";
    Regex regexProcLine = null;

    var regexAnexo = new Regex(
        @"^(OBSERVAÇÕES.*?|OBSERVAÇão.*?|INSTRUÇÕES.*?)\s*\:?$",
        RegexOptions.IgnoreCase);
    var regexPageFooter = new Regex(
        @"^(?<PAGE>\d+)?\s*Classificação Brasileira Hierarquizada de Procedimentos Médicos - (?:\d{4}|(?:\d{1,2}.*?edi[cç][aã]o))\s*(?<PAGE>\d+)?$",
        RegexOptions.IgnoreCase);
    var regexNumber = new Regex(@"^\d+$");

    // State variables.
    Pagina curPagina = null;
    Capitulo curCapitulo = null;
    ProcTipo curProcTipo = null;
    ProcSubtipo curProcSubtipo = null;
    TextoAnexo curAnexo = null; // this can span multiples pages
    var builder = new StringBuilder(10000);
    int pageNum = 0;
    string curGroupName = null; // this happens at page 154 e 155
    bool isExplicitAttachment = false; // this happens at page 202

    // Processing all pages and lines of the text.
    var splitPages = text.Split('\f');
    for (int itPage = 0; itPage < splitPages.Length; itPage++)
    {
        var pageText = splitPages[itPage];

        // Codes explicitly declared as missing count as processed.
        var matchExplicitMissing = Regex.Matches(pageText, @"####EXPLICIT-MISSING:(.*?)(?=####|\r\n|\n|\r|$)");
        foreach (var eachMatch in matchExplicitMissing.Cast<Match>())
        {
            codigosUsados.Add(eachMatch.Groups[1].Value);
        }

        pageText = Regex.Replace(pageText, @" *####(?:ERRO|EXPLICIT-MISSING).*?(?=\r|\n|$)", "");

        // The page object receives the text without any marker at all.
        curPagina = this.CreatePagina(itPage, Regex.Replace(pageText, patternMarker, ""));

        var matchCapitulo = regexCapitulo.Match(pageText);
        if (matchCapitulo.Success)
        {
            // If there is an attachment being read, we need to finalize it.
            if (curAnexo != null)
            {
                this.FinalizeOpenAnexo(curAnexo, builder);
                curAnexo = null;
            }

            // If this page is the chapter title we must create a new chapter object.
            int num = int.Parse(matchCapitulo.Groups["NUM"].Value);
            string name = Regex.Replace(matchCapitulo.Groups["NAME"].Value, @"\s+", " ");
            curCapitulo = this.CreateCapitulo(num, name, curPagina);

            // Setting the new value of the regex, that depends on the name of the chapter.
            // Each word is escaped so that regex metacharacters in the name are matched
            // literally; words may be separated by any amount of whitespace.
            var headingPattern = string.Join(
                @"\s+",
                name.Split(' ').Select(part => Regex.Escape(part)));
            regexProcedimentosCapitulo = new Regex(
                string.Format(@"^{0}$", headingPattern),
                RegexOptions.IgnoreCase);

            continue;
        }

        if (curCapitulo != null)
        {
            // State variables to read the lines.
            bool canReadCapitulo = true;
            bool canReadProcTipo = true;
            bool canReadColumnTitle = true;
            int columnTitleLines = 0; // lines of text devoted to the column titles.
            bool foundPageFooter = false;
            bool foundPageNum = false;

            var splitLines = Regex.Split(pageText, @"\r\n|\r|\n");
            for (int itLine = 0; itLine < splitLines.Length; itLine++)
            {
                var lineText = splitLines[itLine].Trim();

                if (lineText.Contains("####ANEXO-INICIO"))
                {
                    isExplicitAttachment = true;
                }

                if (lineText.Contains("####ANEXO-FIM"))
                {
                    isExplicitAttachment = false;
                    lineText = Regex.Replace(lineText, patternMarker, "");
                }

                if (string.IsNullOrEmpty(lineText))
                {
                    // We use blank lines only in the attachments, such as observations and instructions.
                    bool canReadAnexo = !canReadCapitulo && !canReadProcTipo
                        && !canReadColumnTitle && !foundPageFooter;
                    if (curAnexo != null && canReadAnexo)
                    {
                        builder.AppendLine();
                    }

                    continue;
                }

                if (lineText.Contains("####GROUP"))
                {
                    lineText = Regex.Replace(lineText, patternMarker, "");
                    curGroupName = lineText;
                    continue;
                }

                if (lineText.Contains("####ANEXO-CAPITULO"))
                {
                    // This item has no specific code.
                    lineText = Regex.Replace(lineText, patternMarker, "");
                    curAnexo = this.CreateAnexo(
                        null,
                        lineText.Trim(':').Trim().ToUpperInvariant(),
                        curPagina,
                        null,
                        null);
                    curAnexo.Capitulo = curCapitulo;
                    curCapitulo.Anexos.Add(curAnexo);
                    builder.Clear();
                    builder.AppendLine(lineText);
                    continue;
                }

                // Chapter heading that exists in every page.
                // When we find this heading, then we set the page's chapter property.
                if (canReadCapitulo)
                {
                    var matchProcCapitulo = regexProcedimentosCapitulo.Match(lineText);
                    if (matchProcCapitulo.Success)
                    {
                        // Setting the chapter that the page belongs to.
                        curPagina.Capitulo = curCapitulo;
                        canReadCapitulo = false;
                        continue;
                    }
                }

                // Reading the ProcTipo heading that exists after the Chapter heading.
                // This heading contains the level-2 node of the CBHPM tree.
                if (canReadProcTipo)
                {
                    var matchProcedimentoTipo = regexProcedimentoTipo.Match(lineText);
                    if (matchProcedimentoTipo.Success)
                    {
                        // All pages inside the same ProcTipo have the same heading,
                        // so we use a method that creates the object if it does not exist,
                        // or returns the existing one.
                        var curProcTipo2 = this.GetOrCreateProcTipo(
                            matchProcedimentoTipo.Groups[2].Value,
                            matchProcedimentoTipo.Groups[1].Value,
                            curPagina,
                            curCapitulo);

                        // If the ProcTipo changed and there is an attachment being read,
                        // we need to finalize it.
                        if (curProcTipo != curProcTipo2 && curAnexo != null)
                        {
                            this.FinalizeOpenAnexo(curAnexo, builder);
                            curAnexo = null;
                        }

                        curProcTipo = curProcTipo2;
                        codigosUsados.Add(curProcTipo.Codigo);
                        canReadProcTipo = false;
                        continue;
                    }
                }

                // All pages that contains medical procedures, have the titles of the columns.
                // We need these columns to know what is inside each column, and set the values
                // correctly.
                if (canReadColumnTitle)
                {
                    var matchColumnTitles = regexColumnTitles.Match(lineText);

                    // Group 2 captures any word that is not a known column title,
                    // so the line is a title line only when group 2 did not participate.
                    if (matchColumnTitles.Success && !matchColumnTitles.Groups[2].Success)
                    {
                        var columnCaptures = matchColumnTitles.Groups[1].Captures.Cast<Capture>().ToArray();
                        foreach (var eachCapture in columnCaptures)
                        {
                            curPagina.ColunaAdd(eachCapture.Value.TrimEnd('.'));
                        }

                        columnTitleLines++;
                        if (columnTitleLines > 2)
                        {
                            throw new Exception("More than 2 lines for column titles.");
                        }

                        continue;
                    }

                    if (columnTitleLines == 0)
                    {
                        throw new Exception("No column titles have been found.");
                    }

                    // If there is no more columns to read, then we create the regex to read
                    // each medical procedure.
                    var valuePatternsOfPage = new List<string>();
                    foreach (var eachColName in curPagina.Colunas)
                    {
                        var colPattern = mapColNamePattern[eachColName];
                        if (colPattern != null)
                        {
                            valuePatternsOfPage.Add(colPattern);
                        }
                    }

                    var patternValues = string.Format(@"(?:{0})", string.Join(@"\s+", valuePatternsOfPage));
                    regexProcLine = new Regex(
                        string.Format(patternProcLine, patternValues),
                        RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

                    canReadColumnTitle = false;
                }

                // Reading the footer of the page.
                // The footer can not come before the end of the page.
                {
                    var matchPageFooter = regexPageFooter.Match(lineText);
                    if (matchPageFooter.Success)
                    {
                        if (foundPageFooter)
                        {
                            throw new Exception("Page footer duplicated.");
                        }

                        var strPageNum = matchPageFooter.Groups["PAGE"].Value;
                        if (int.TryParse(strPageNum, out pageNum))
                        {
                            if (curPagina.Numero != null && curPagina.Numero != pageNum)
                            {
                                throw new Exception("Page already has a number.");
                            }

                            if (foundPageNum)
                            {
                                throw new Exception("Page number duplicated.");
                            }

                            foundPageNum = true;
                            curPagina.Numero = pageNum;
                            pageNum = 0;
                        }

                        foundPageFooter = true;
                        continue;
                    }

                    // A line made only of digits is accepted as the page number,
                    // but only after the footer text has been found.
                    var matchNumber = regexNumber.Match(lineText);
                    if (matchNumber.Success)
                    {
                        pageNum = int.Parse(lineText);
                        if (!foundPageFooter || (curPagina.Numero != null && curPagina.Numero != pageNum))
                        {
                            throw new Exception("Found a lost number.");
                        }

                        if (foundPageNum)
                        {
                            throw new Exception("Page number duplicated.");
                        }

                        foundPageNum = true;
                        curPagina.Numero = pageNum;
                        pageNum = 0;
                        continue;
                    }
                }

                if (foundPageFooter)
                {
                    throw new Exception("Cannot read after page footer.");
                }

                // Reading the remaining of the attachment, if there is a current attachment.
                // The text of the attachment always start by multiple spaces.
                if (curAnexo != null)
                {
                    if (splitLines[itLine].StartsWith(" ", StringComparison.Ordinal) || isExplicitAttachment)
                    {
                        // The untrimmed line is used here, to keep the original indentation.
                        lineText = Regex.Replace(splitLines[itLine], patternMarker, "");
                        builder.AppendLine(lineText);
                        continue;
                    }

                    // The line does not belong to the attachment, so we finalize it.
                    this.FinalizeOpenAnexo(curAnexo, builder);
                    curAnexo = null;
                }

                // Reading ProcSubtipo. A page may contain multiple subtipos.
                // Each subtipo exists only once in the document, it is not repeated in all
                // pages like ProcTipo.
                {
                    var subtipoText = lineText;
                    Match matchProcedimentoSubtipo = null;
                    for (int itNext = 0; itNext < 3; itNext++)
                    {
                        matchProcedimentoSubtipo = regexProcedimentoSubtipo.Match(subtipoText);
                        if (matchProcedimentoSubtipo.Success)
                        {
                            // Skipping the extra lines that were consumed by the subtipo text.
                            itLine += itNext;
                            break;
                        }

                        // There may be no next line to look at (end of the page).
                        int nextIndex = itLine + itNext + 1;
                        if (nextIndex >= splitLines.Length)
                        {
                            break;
                        }

                        var nextLine = splitLines[nextIndex];
                        if (!Regex.IsMatch(nextLine, @"^\s?\w+")
                            || Regex.IsMatch(nextLine, @"(?<!\()\d\.\d\d\.\d\d\.\d\d-\d(?!\))"))
                        {
                            break;
                        }

                        // If not match, then we try to append the next line, and do the match again.
                        // Some SubtipoTexts are split in 2/3 lines.
                        subtipoText = string.Format("{0} {1}", subtipoText, nextLine);
                    }

                    if (matchProcedimentoSubtipo.Success)
                    {
                        curProcSubtipo = this.CreateProcSubtipo(
                            matchProcedimentoSubtipo.Groups[2].Value,
                            matchProcedimentoSubtipo.Groups[1].Value,
                            curPagina,
                            curProcTipo);

                        curGroupName = null;
                        codigosUsados.Add(curProcSubtipo.Codigo);
                        continue;
                    }
                }

                // Reading the medical procedures in the current page.
                // Note that some of these items may not be procedures but observations.
                {
                    var matchProcedimento = regexProcLine.Match(lineText);
                    if (matchProcedimento.Success && matchProcedimento.Groups["CODE"].Success)
                    {
                        string code = matchProcedimento.Groups["CODE"].Value;
                        var nameLine1 = matchProcedimento.Groups["NAME"].Value;

                        if (regexAnexo.IsMatch(nameLine1))
                        {
                            // The coded item is an attachment (observations / instructions).
                            curAnexo = this.CreateAnexo(
                                code,
                                nameLine1.Trim(':').Trim().ToUpperInvariant(),
                                curPagina,
                                curProcTipo,
                                curProcSubtipo);
                            codigosUsados.Add(code);
                            builder.Clear();
                            builder.AppendLine(nameLine1);
                        }
                        else
                        {
                            builder.Clear();
                            builder.Append(nameLine1);

                            // Reading next lines until we find the end of the medical procedure,
                            // that is the line that carries the PORTE value.
                            while (!matchProcedimento.Groups["PORTE"].Success)
                            {
                                itLine++;
                                if (itLine >= splitLines.Length)
                                {
                                    throw new Exception("Medical procedure not terminated.");
                                }

                                lineText = splitLines[itLine].Trim();
                                matchProcedimento = regexProcLine.Match(lineText);

                                if (Regex.IsMatch(lineText, @"\d\.\d\d\.\d\d\.\d\d-\d"))
                                {
                                    throw new Exception("Code must be found only in first line of medical procedure.");
                                }

                                if (!splitLines[itLine].StartsWith(" ", StringComparison.Ordinal))
                                {
                                    throw new Exception("All lines after the first one of medical procedure, must start with spaces.");
                                }

                                builder.Append(' ' + matchProcedimento.Groups["NAME"].Value);
                            }

                            string procName = builder.ToString();
                            builder.Clear();
                            string procPorte = matchProcedimento.Groups["PORTE"].Value;
                            string procCusto = matchProcedimento.Groups["CUSTO"].Value;
                            string procAnest = matchProcedimento.Groups["ANEST"].Value;
                            string procAux = matchProcedimento.Groups["AUX"].Value;
                            string procUR = matchProcedimento.Groups["UR"].Value;
                            string procInc = matchProcedimento.Groups["INC"].Value;
                            string procDoc = matchProcedimento.Groups["DOC"].Value;

                            var proc = new Proc();
                            proc.Cbhpm = this;
                            proc.Codigo = code;
                            proc.Nome = procName;
                            proc.Porte = procPorte;
                            proc.CustoOper = procCusto;
                            proc.PorteAnest = procAnest;
                            proc.NumAux = procAux;
                            proc.Ur = procUR;
                            proc.Inc = procInc;
                            proc.FilmeOuDoc = procDoc;
                            proc.GrupoNoSubtipo = curGroupName;
                            proc.Subtipo = curProcSubtipo;
                            curProcSubtipo.Procedimentos.Add(proc);
                            proc.PaginaDeclarada = curPagina;

                            var simpleCode = GetCodeSimple(code);
                            this.Items.Add(StructTuple.Create(simpleCode, 0), proc);
                            codigosUsados.Add(code);
                        }

                        continue;
                    }

                    throw new Exception("Unknown line found.");
                }
            }
        }

        // Checking the columns of the page.
        FinalizarPagina(curPagina);

        // Finishing the page and checking page properties.
        if (curPagina.Capitulo != curCapitulo && !string.IsNullOrEmpty(pageText))
        {
            throw new Exception("Page chapter does not match the current chapter.");
        }

        if (curPagina.ProcTipo != curProcTipo && !string.IsNullOrEmpty(pageText))
        {
            throw new Exception("Page ProcTipo does not match the current ProcTipo.");
        }
    }

    // An attachment that runs until the end of the document is not followed by any line
    // that would close it, so we finalize it here to not lose its text.
    if (curAnexo != null)
    {
        this.FinalizeOpenAnexo(curAnexo, builder);
        curAnexo = null;
    }

    // Checking codes that were not processed.
    var codigosNaoUsados = new HashSet<string>(codigosExistentes);
    codigosNaoUsados.ExceptWith(codigosUsados);
    if (codigosNaoUsados.Any())
    {
        throw new Exception("There are unused codes!");
    }
}

/// <summary>
/// Stores the trimmed text accumulated in the builder into the attachment,
/// checks it with <c>CheckForMultipleLines</c> and clears the builder.
/// </summary>
/// <param name="anexo">Attachment being finalized; must not be null.</param>
/// <param name="builder">Builder holding the text read so far for the attachment.</param>
private void FinalizeOpenAnexo(TextoAnexo anexo, StringBuilder builder)
{
    anexo.Texto = builder.ToString().Trim();
    CheckForMultipleLines(anexo.Texto);
    builder.Clear();
}