コード例 #1
0
ファイル: Cbhpm.cs プロジェクト: manolopesquera/Cerebello
        /// <summary>
        /// Parses the CBHPM text that was copied out of the PDF and fills this object with the
        /// pages, chapters, ProcTipo/ProcSubtipo nodes, procedures and attachments found in it
        /// (through the Create*/GetOrCreate* helpers and <c>this.Items</c>).
        /// </summary>
        /// <remarks>
        /// Pages are separated by form-feed characters ('\f') and processed line by line with a
        /// small state machine. Lines may carry manual annotations introduced by "####"
        /// (ERRO, EXPLICIT-MISSING, GROUP, ANEXO-INICIO, ANEXO-FIM, ANEXO-CAPITULO) that steer
        /// the parser. After the last page, every code that appears anywhere in the text must
        /// have been consumed (or declared with "####EXPLICIT-MISSING:"), otherwise the load fails.
        /// </remarks>
        /// <param name="text">Whole text of the document, with pages separated by '\f'.</param>
        /// <exception cref="Exception">
        /// Thrown when a line cannot be classified, when page structure rules are violated
        /// (column titles, footer, page number), when a procedure is not terminated inside its
        /// page, or when codes present in the text were never processed.
        /// </exception>
        public void LoadFromTextCopiedOfPdf(string text)
        {
            // Reading all codes that exist in the text, so that we can compare with
            // the list of processed codes to know if any is missing at the end.
            var codigosExistentes = new HashSet<string>();
            var matchCodigos      = Regex.Matches(text, @"\d\s?\.\s?\d\d\s?\.\s?\d\d\s?\.\s?\d\d\s?\-\s?\d\d?");

            foreach (Match eachMatch in matchCodigos)
            {
                codigosExistentes.Add(eachMatch.Value);
            }
            var codigosUsados = new HashSet<string>();

            // Regexes that are going to be used.

            // Matched against a whole page: a page consisting only of "CAPÍTULO <n> <name>" is a chapter title page.
            var regexCapitulo = new Regex(@"^\s*(?<NUM>\d+)?\s*CAPÍTULO\s*(?<NUM>\d+)?\s*(?<NAME>.*?)\s*$", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // "<name> <code>" heading; group 1 is the name, group 2 is the code.
            var regexProcedimentoTipo = new Regex(
                @"^([\s\w/\-]+?)\s+(\d\.\d\d\.\d\d\.\d\d\-\d)$",
                RegexOptions.IgnoreCase);

            // "<name> (<code>)" heading; group 1 is the name, group 2 is the code.
            var regexProcedimentoSubtipo = new Regex(
                @"^(.*?)\s+\(\s?(\d\.\d\d\.\d\d\.\d\d\-\d)\)$",
                RegexOptions.IgnoreCase);

            Regex regexProcedimentosCapitulo = null; // this one is dynamic

            // Group 1 captures known column title words; group 2 captures any other word.
            // A line only counts as a column title line when group 2 did not participate.
            var regexColumnTitles = new Regex(
                @"^(?:(?:(Código|Procedimentos|Porte|Custo|Oper\.|Nº de|Aux\.?|Anest\.|Inc\.|Filme|ou Doc|UR)|(\w+\.?))(?:\s+|$))+$",
                RegexOptions.IgnoreCase);

            // Value pattern of each column, keyed by the names listed in curPagina.Colunas.
            // A null pattern means the column contributes nothing to the values part of the line regex.
            var mapColNamePattern = new Dictionary<string, string>
            {
                { "código", null },
                { "procedimentos", null },
                { "porte", @"(?<PORTE>(?:(?:[\d,]+\s*de\s*)?\d+[a-z]+)|-)" },
                { "custo oper", @"(?<CUSTO>-|\d+,\d{2,3})" },
                { "porte anest", @"(?<ANEST>-|\d+)" },
                { "nº de aux", @"(?<AUX>-|\d+)" },
                { "ur", @"(?<UR>\*)" },
                { "filme ou doc", @"(?<DOC>-|\d+,\d{4})" },
                { "inc", @"(?<INC>-|\d+)" },
            };

            // Template of a procedure line; {0} receives the value patterns of the page's columns.
            // It is compiled with IgnorePatternWhitespace, so the layout below is not significant.
            const string patternProcLine = @"
                ^
                (?<CODE>\d\.\d\d\.\d\d\.\d\d\-\d)?\s*
                (?<NAME>.+?)\s*
                (?:
                  (?<DOTS>[\.\s]*\.\s*)
                  {0}
                )?
                \s*
                (?:(?=\r\n|\r|\n|$))
                ";

            Regex regexProcLine = null; // built once the column titles of a page are known

            var regexAnexo = new Regex(@"^(OBSERVAÇÕES.*?|OBSERVAÇão.*?|INSTRUÇÕES.*?)\s*\:?$", RegexOptions.IgnoreCase);

            var regexPageFooter = new Regex(
                @"^(?<PAGE>\d+)?\s*Classificação Brasileira Hierarquizada de Procedimentos Médicos - (?:\d{4}|(?:\d{1,2}.*?edi[cç][aã]o))\s*(?<PAGE>\d+)?$",
                RegexOptions.IgnoreCase);

            var regexNumber = new Regex(@"^\d+$");

            // State variables.
            Pagina      curPagina            = null;
            Capitulo    curCapitulo          = null;
            ProcTipo    curProcTipo          = null;
            ProcSubtipo curProcSubtipo       = null;
            TextoAnexo  curAnexo             = null; // this can span multiple pages
            var         builder              = new StringBuilder(10000);
            int         pageNum              = 0;
            string      curGroupName         = null;  // this happens at pages 154 and 155
            bool        isExplicitAttachment = false; // this happens at page 202

            // Processing all pages and lines of the text.
            var splitPages = text.Split('\f');

            for (int itPage = 0; itPage < splitPages.Length; itPage++)
            {
                var pageText = splitPages[itPage];

                // Codes explicitly annotated as missing count as used in the final check.
                var matchExplicitMissing = Regex.Matches(pageText, @"####EXPLICIT-MISSING:(.*?)(?=####|\r\n|\n|\r|$)");
                foreach (var eachMatch in matchExplicitMissing.Cast<Match>())
                {
                    codigosUsados.Add(eachMatch.Groups[1].Value);
                }

                // ERRO and EXPLICIT-MISSING annotations are removed up front; the other
                // annotations must stay because the line loop below reacts to them.
                pageText = Regex.Replace(pageText, @" *####(?:ERRO|EXPLICIT-MISSING).*?(?=\r|\n|$)", "");

                // The page object receives the text without any annotation.
                curPagina = this.CreatePagina(itPage, Regex.Replace(pageText, @" *####.*?(?=\r|\n|$)", ""));

                var matchCapitulo = regexCapitulo.Match(pageText);

                if (matchCapitulo.Success)
                {
                    // If there is an attachment being read, we need to finalize it.
                    if (curAnexo != null)
                    {
                        curAnexo.Texto = builder.ToString().Trim();
                        CheckForMultipleLines(curAnexo.Texto);
                        builder.Clear();
                        curAnexo = null;
                    }

                    // If this page is the chapter title we must create a new chapter object.
                    // NOTE(review): both NUM groups are optional in the regex; when neither matches,
                    // int.Parse receives an empty string and throws FormatException.
                    int    num  = int.Parse(matchCapitulo.Groups["NUM"].Value);
                    string name = Regex.Replace(matchCapitulo.Groups["NAME"].Value, @"\s+", " ");

                    curCapitulo = this.CreateCapitulo(num, name, curPagina);

                    // Setting the new value of the regex, that depends on the name of the chapter.
                    // Each word is escaped so that regex metacharacters in the chapter name
                    // (parentheses, dots, ...) are matched literally; words may be separated by
                    // any amount of white-space in the page headings.
                    var nameParts = name.Split(' ').Select(part => Regex.Escape(part)).ToArray();
                    regexProcedimentosCapitulo = new Regex(
                        string.Format(@"^{0}$", string.Join(@"\s+", nameParts)),
                        RegexOptions.IgnoreCase);

                    continue;
                }

                if (curCapitulo != null)
                {
                    // State variables to read the lines.
                    bool canReadCapitulo    = true;
                    bool canReadProcTipo    = true;
                    bool canReadColumnTitle = true;
                    int  columnTitleLines   = 0; // lines of text devoted to the column titles.

                    bool foundPageFooter = false;
                    bool foundPageNum    = false;

                    var splitLines = Regex.Split(pageText, @"\r\n|\r|\n");
                    for (int itLine = 0; itLine < splitLines.Length; itLine++)
                    {
                        var lineText = splitLines[itLine].Trim();

                        if (lineText.Contains("####ANEXO-INICIO"))
                        {
                            isExplicitAttachment = true;
                        }

                        if (lineText.Contains("####ANEXO-FIM"))
                        {
                            isExplicitAttachment = false;
                            lineText             = Regex.Replace(lineText, @" *####.*?(?=\r|\n|$)", "");
                        }

                        if (string.IsNullOrEmpty(lineText))
                        {
                            // We use blank lines only in the attachments, such as observations and instructions.
                            bool canReadAnexo = !canReadCapitulo && !canReadProcTipo && !canReadColumnTitle && !foundPageFooter;
                            if (curAnexo != null && canReadAnexo)
                            {
                                builder.AppendLine();
                            }

                            continue;
                        }

                        if (lineText.Contains("####GROUP"))
                        {
                            // The text before the annotation is the group name that is
                            // assigned to the following procedures.
                            lineText     = Regex.Replace(lineText, @" *####.*?(?=\r|\n|$)", "");
                            curGroupName = lineText;

                            continue;
                        }

                        if (lineText.Contains("####ANEXO-CAPITULO"))
                        {
                            // This item has no specific code.
                            lineText = Regex.Replace(lineText, @" *####.*?(?=\r|\n|$)", "");
                            curAnexo = this.CreateAnexo(
                                null,
                                lineText.Trim(':').Trim().ToUpperInvariant(),
                                curPagina,
                                null,
                                null);

                            curAnexo.Capitulo = curCapitulo;
                            curCapitulo.Anexos.Add(curAnexo);

                            builder.Clear();
                            builder.AppendLine(lineText);

                            continue;
                        }

                        // Chapter heading that exists in every page.
                        // When we find this heading, then we set the page's chapter property.
                        if (canReadCapitulo)
                        {
                            var matchProcCapitulo = regexProcedimentosCapitulo.Match(lineText);

                            if (matchProcCapitulo.Success)
                            {
                                // Setting the chapter that the page belongs to.
                                curPagina.Capitulo = curCapitulo;

                                canReadCapitulo = false;

                                continue;
                            }
                        }

                        // Reading the ProcTipo heading that exists after the Chapter heading.
                        // This heading contains the level-2 node of the CBHPM tree.
                        if (canReadProcTipo)
                        {
                            var matchProcedimentoTipo = regexProcedimentoTipo.Match(lineText);

                            if (matchProcedimentoTipo.Success)
                            {
                                // All pages inside the same ProcTipo have the same heading,
                                // so we use a method that creates the object if it does not exist,
                                // or returns the existing one.
                                var curProcTipo2 = this.GetOrCreateProcTipo(
                                    matchProcedimentoTipo.Groups[2].Value,
                                    matchProcedimentoTipo.Groups[1].Value,
                                    curPagina,
                                    curCapitulo);

                                if (curProcTipo != curProcTipo2)
                                {
                                    // If there is an attachment being read, we need to finalize it.
                                    if (curAnexo != null)
                                    {
                                        curAnexo.Texto = builder.ToString().Trim();
                                        CheckForMultipleLines(curAnexo.Texto);
                                        builder.Clear();
                                        curAnexo = null;
                                    }
                                }

                                curProcTipo = curProcTipo2;

                                codigosUsados.Add(curProcTipo.Codigo);

                                canReadProcTipo = false;

                                continue;
                            }
                        }

                        // All pages that contain medical procedures have the titles of the columns.
                        // We need these columns to know what is inside each column, and set the values
                        // correctly.
                        if (canReadColumnTitle)
                        {
                            var matchColumnTitles = regexColumnTitles.Match(lineText);

                            if (matchColumnTitles.Success && !matchColumnTitles.Groups[2].Success)
                            {
                                var columnCaptures = matchColumnTitles.Groups[1].Captures.Cast<Capture>().ToArray();
                                foreach (var eachCapture in columnCaptures)
                                {
                                    curPagina.ColunaAdd(eachCapture.Value.TrimEnd('.'));
                                }

                                columnTitleLines++;

                                if (columnTitleLines > 2)
                                {
                                    throw new Exception("More than 2 lines for column titles.");
                                }

                                continue;
                            }

                            if (columnTitleLines == 0)
                            {
                                throw new Exception("No column titles have been found.");
                            }

                            // If there are no more columns to read, then we create the regex to read
                            // each medical procedure.
                            // NOTE(review): the indexer throws KeyNotFoundException when a column name
                            // is not a key of mapColNamePattern - confirm what ColunaAdd stores in Colunas.
                            var valuePatternsOfPage = new List<string>();
                            foreach (var eachColName in curPagina.Colunas)
                            {
                                var colPattern = mapColNamePattern[eachColName];
                                if (colPattern != null)
                                {
                                    valuePatternsOfPage.Add(colPattern);
                                }
                            }
                            var patternValues = string.Format(@"(?:{0})", string.Join(@"\s+", valuePatternsOfPage));
                            regexProcLine = new Regex(
                                string.Format(patternProcLine, patternValues),
                                RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

                            canReadColumnTitle = false;
                        }

                        // Reading the footer of the page.
                        // The footer can not come before the end of the page.
                        {
                            var matchPageFooter = regexPageFooter.Match(lineText);

                            if (matchPageFooter.Success)
                            {
                                if (foundPageFooter)
                                {
                                    throw new Exception("Page footer duplicated.");
                                }

                                // The page number may be part of the footer line itself.
                                var strPageNum = matchPageFooter.Groups["PAGE"].Value;
                                if (int.TryParse(strPageNum, out pageNum))
                                {
                                    if (curPagina.Numero != null && curPagina.Numero != pageNum)
                                    {
                                        throw new Exception("Page already has a number.");
                                    }

                                    if (foundPageNum)
                                    {
                                        throw new Exception("Page number duplicated.");
                                    }

                                    foundPageNum = true;

                                    curPagina.Numero = pageNum;
                                    pageNum          = 0;
                                }

                                foundPageFooter = true;

                                continue;
                            }

                            // A line made only of digits is accepted as the page number,
                            // but only after the footer has been found.
                            var matchNumber = regexNumber.Match(lineText);

                            if (matchNumber.Success)
                            {
                                pageNum = int.Parse(lineText);

                                if (!foundPageFooter || curPagina.Numero != null && curPagina.Numero != pageNum)
                                {
                                    throw new Exception("Found a lost number.");
                                }

                                if (foundPageNum)
                                {
                                    throw new Exception("Page number duplicated.");
                                }

                                foundPageNum = true;

                                curPagina.Numero = pageNum;
                                pageNum          = 0;

                                continue;
                            }
                        }

                        if (foundPageFooter)
                        {
                            throw new Exception("Cannot read after page footer.");
                        }

                        // Reading the remaining of the attachment, if there is a current attachment.
                        // The text of the attachment always starts with multiple spaces.
                        if (curAnexo != null)
                        {
                            if (splitLines[itLine].StartsWith("    ") || isExplicitAttachment)
                            {
                                // The untrimmed line is used, so the original indentation is kept.
                                lineText = Regex.Replace(splitLines[itLine], @" *####.*?(?=\r|\n|$)", "");
                                builder.AppendLine(lineText);
                                continue;
                            }
                        }

                        // If there is an attachment being read, we need to finalize it.
                        if (curAnexo != null)
                        {
                            curAnexo.Texto = builder.ToString().Trim();
                            CheckForMultipleLines(curAnexo.Texto);
                            builder.Clear();
                            curAnexo = null;
                        }

                        // Reading ProcSubtipo. A page may contain multiple subtipos.
                        // Each subtipo exists only once in the document, it is not repeated in all
                        // pages like ProcTipo.
                        {
                            var   subtipoText = lineText;
                            Match matchProcedimentoSubtipo = null;
                            for (int itNext = 0; itNext < 3; itNext++)
                            {
                                matchProcedimentoSubtipo = regexProcedimentoSubtipo.Match(subtipoText);
                                if (matchProcedimentoSubtipo.Success)
                                {
                                    // Skipping the extra lines that were consumed by the heading.
                                    itLine += itNext;
                                    break;
                                }

                                // There is nothing to look ahead at when the page has no more lines.
                                int nextIndex = itLine + itNext + 1;
                                if (nextIndex >= splitLines.Length)
                                {
                                    break;
                                }

                                var nextLine = splitLines[nextIndex];
                                if (!Regex.IsMatch(nextLine, @"^\s?\w+") || Regex.IsMatch(nextLine, @"(?<!\()\d\.\d\d\.\d\d\.\d\d-\d(?!\))"))
                                {
                                    break;
                                }

                                // If not match, then we try to append the next line, and do the match again.
                                // Some SubtipoTexts are split in 2/3 lines.
                                subtipoText = string.Format("{0} {1}", subtipoText, nextLine);
                            }

                            if (matchProcedimentoSubtipo.Success)
                            {
                                curProcSubtipo = this.CreateProcSubtipo(
                                    matchProcedimentoSubtipo.Groups[2].Value,
                                    matchProcedimentoSubtipo.Groups[1].Value,
                                    curPagina,
                                    curProcTipo);

                                // A group name does not carry over to another subtipo.
                                curGroupName = null;

                                codigosUsados.Add(curProcSubtipo.Codigo);

                                continue;
                            }
                        }

                        // Reading the medical procedures in the current page.
                        // Note that some of these items may not be procedures but observations.
                        {
                            var matchProcedimento = regexProcLine.Match(lineText);

                            if (matchProcedimento.Success && matchProcedimento.Groups["CODE"].Success)
                            {
                                string code = matchProcedimento.Groups["CODE"].Value;

                                var nameLine1 = matchProcedimento.Groups["NAME"].Value;

                                if (regexAnexo.IsMatch(nameLine1))
                                {
                                    // A coded item whose name is an observations/instructions title
                                    // starts an attachment; its text is collected from the next lines.
                                    curAnexo = this.CreateAnexo(
                                        code,
                                        nameLine1.Trim(':').Trim().ToUpperInvariant(),
                                        curPagina,
                                        curProcTipo,
                                        curProcSubtipo);

                                    codigosUsados.Add(code);

                                    builder.Clear();
                                    builder.AppendLine(nameLine1);
                                }
                                else
                                {
                                    builder.Clear();
                                    builder.Append(nameLine1);
                                    // Reading next lines until we find the end of the medical procedure.
                                    // The procedure ends at the line that carries the column values (PORTE).
                                    for (; itLine < splitLines.Length;)
                                    {
                                        if (matchProcedimento.Groups["PORTE"].Success)
                                        {
                                            string procName = builder.ToString();
                                            builder.Clear();

                                            string procPorte = matchProcedimento.Groups["PORTE"].Value;
                                            string procCusto = matchProcedimento.Groups["CUSTO"].Value;
                                            string procAnest = matchProcedimento.Groups["ANEST"].Value;
                                            string procAux   = matchProcedimento.Groups["AUX"].Value;
                                            string procUR    = matchProcedimento.Groups["UR"].Value;
                                            string procInc   = matchProcedimento.Groups["INC"].Value;
                                            string procDoc   = matchProcedimento.Groups["DOC"].Value;

                                            var proc = new Proc();
                                            proc.Cbhpm      = this;
                                            proc.Codigo     = code;
                                            proc.Nome       = procName;
                                            proc.Porte      = procPorte;
                                            proc.CustoOper  = procCusto;
                                            proc.PorteAnest = procAnest;
                                            proc.NumAux     = procAux;
                                            proc.Ur         = procUR;
                                            proc.Inc        = procInc;
                                            proc.FilmeOuDoc = procDoc;

                                            proc.GrupoNoSubtipo = curGroupName;

                                            // NOTE(review): curProcSubtipo is null until the first subtipo
                                            // heading is read; a procedure before it fails on the next lines.
                                            proc.Subtipo = curProcSubtipo;
                                            curProcSubtipo.Procedimentos.Add(proc);

                                            proc.PaginaDeclarada = curPagina;

                                            var simpleCode = GetCodeSimple(code);
                                            this.Items.Add(StructTuple.Create(simpleCode, 0), proc);

                                            codigosUsados.Add(code);

                                            break;
                                        }

                                        itLine++;

                                        // The page ended before the values were found: leave the loop so
                                        // that the check below reports the unterminated procedure, instead
                                        // of failing with an index out of range.
                                        if (itLine >= splitLines.Length)
                                        {
                                            break;
                                        }

                                        lineText = splitLines[itLine].Trim();

                                        matchProcedimento = regexProcLine.Match(lineText);

                                        if (Regex.IsMatch(lineText, @"\d\.\d\d\.\d\d\.\d\d-\d"))
                                        {
                                            throw new Exception("Code must be found only in first line of medical procedure.");
                                        }

                                        if (!splitLines[itLine].StartsWith("    "))
                                        {
                                            throw new Exception("All lines after the first one of medical procedure, must start with spaces.");
                                        }

                                        builder.Append(' ' + matchProcedimento.Groups["NAME"].Value);
                                    }

                                    if (itLine >= splitLines.Length)
                                    {
                                        throw new Exception("Medical procedure not terminated.");
                                    }
                                }

                                continue;
                            }

                            throw new Exception("Unknown line found.");
                        }
                    }
                }

                // Checking the columns of the page.
                FinalizarPagina(curPagina);

                // Finishing the page and checking page properties.
                if (curPagina.Capitulo != curCapitulo && !string.IsNullOrEmpty(pageText))
                {
                    throw new Exception("The chapter of the page is not the current chapter.");
                }

                if (curPagina.ProcTipo != curProcTipo && !string.IsNullOrEmpty(pageText))
                {
                    throw new Exception("The ProcTipo of the page is not the current ProcTipo.");
                }
            }

            // Checking codes that were not processed.
            var codigosNaoUsados = new HashSet<string>(codigosExistentes);

            codigosNaoUsados.ExceptWith(codigosUsados);

            if (codigosNaoUsados.Any())
            {
                throw new Exception("There are unused codes!");
            }
        }