PdfUnstructuredDoc C# (CSharp) Code Examples

Example #1

0

Show file

        /// <summary>
        /// Loads the pattern store file that corresponds to a document.
        /// </summary>
        /// <param name="pdf">Document whose <c>DocCategory</c> and <c>DocID</c>
        /// are used to resolve the store file.</param>
        /// <returns>Pattern store read from the resolved file by
        /// <c>XmlParser.FromXml</c>.</returns>
        public static PdfTagPatternStore GetStore(PdfUnstructuredDoc pdf)
        {
            // Directory (from the category) and file name (from the id) are
            // concatenated as-is: no separator is inserted between them here.
            string storePath = $"{GetDirectory(pdf.DocCategory)}{GetFileName(pdf.DocID)}";

            return XmlParser.FromXml<PdfTagPatternStore>(storePath);
        }

Example #2

0

Show file

File: PdfCompareResult.cs Project: bautista225/PdfTagger-2.0

 /// <summary>
 /// Creates a new PdfCompareResult that remembers the inputs of the
 /// comparison it belongs to.
 /// </summary>
 /// <param name="pdf">PdfUnstructuredDoc the result was obtained from.</param>
 /// <param name="metadata">IMetadata the result was obtained from.</param>
 /// <param name="hierarchySet">Catalog of hierarchies by type.</param>
 public PdfCompareResult(PdfUnstructuredDoc pdf, IMetadata metadata,
                         IHierarchySet hierarchySet)
     : this()
 {
     // The parameterless constructor runs first (chained above); afterwards
     // the three inputs are simply stored.
     this._HierarchySet = hierarchySet;
     this._Metadata = metadata;
     this._Pdf = pdf;
 }

Example #3

0

Show file

File: PdfTagPatternStore.cs Project: bautista225/PdfTagger

        /// <summary>
        /// Runs the stored text extraction patterns against a document.
        /// </summary>
        /// <param name="pdf">Document whose pages are processed.</param>
        /// <returns>Extraction result filled in by the per-page extraction
        /// routines, after <c>GetMetadata</c> has been invoked on it.</returns>
        public PdfTagExtractionResult Extract(PdfUnstructuredDoc pdf)
        {
            var extraction = new PdfTagExtractionResult
            {
                Pdf = pdf,
                MetadataType = Type.GetType(MetadataName)
            };

            // Start every run with an empty converter cache.
            _Converters = new Dictionary<Type, object>();

            IHierarchySet hierarchies = GetHierarchySet();

            foreach (var currentPage in pdf.PdfUnstructuredPages)
            {
                // Word groups.
                ExtractFromRectangles(
                    currentPage.WordGroups,
                    extraction.MetadataType,
                    hierarchies,
                    extraction);

                // Lines.
                ExtractFromRectangles(
                    currentPage.Lines,
                    extraction.MetadataType,
                    hierarchies,
                    extraction,
                    "LinesInfos");

                // Whole page text.
                ExtractFromText(extraction.MetadataType, extraction, currentPage, hierarchies);

                // Text string groups.
                ExtractFromTextStrings(
                    currentPage.TextStringGroups,
                    extraction.MetadataType,
                    hierarchies,
                    extraction);
            }

            // Hand the converters gathered during this run to the result
            // before asking it to build the metadata.
            extraction.Converters = _Converters;
            extraction.GetMetadata();

            return extraction;
        }

Example #4

0

Show file

File: PdfTagPatternFactory.cs Project: bautista225/PdfTagger-2.0

        /// <summary>
        /// Loads the pattern store file that corresponds to a document.
        /// </summary>
        /// <param name="pdf">Document whose <c>DocID</c> selects the store
        /// file name.</param>
        /// <returns>Pattern store read from the resolved file by
        /// <c>XmlParser.FromXml</c>.</returns>
        public static PdfTagPatternStore GetStore(PdfUnstructuredDoc pdf)
        {
            // NOTE(review): the directory is fixed to a machine-specific
            // location and pdf.DocCategory is ignored. A category-based
            // directory lookup was previously used here and has been
            // disabled; confirm whether this override is still intended.
            const string patternsDirectory =
                "C:\\ProgramData\\PdfTagger\\Patterns\\InvoicePdfTaggerOriginalModif\\";

            string storePath = patternsDirectory + $"{GetFileName(pdf.DocID)}";

            return XmlParser.FromXml<PdfTagPatternStore>(storePath);
        }

Example #5

0

Show file

File: PdfCompareInfo.cs Project: bautista225/PdfTagger-2.0

 /// <summary>
 /// Creates a new instance of the PdfCompareInfo class.
 /// </summary>
 /// <param name="pdf">PdfUnstructuredDoc instance used in the comparison
 /// this info belongs to.</param>
 /// <param name="pdfPage">PdfUnstructuredPage instance, from the
 /// PdfUnstructuredPages collection of the pdf, on which the result held
 /// by this info was obtained.</param>
 /// <param name="pdfTextRectangle">PdfTextRectangle on which the result
 /// held by this info was obtained.</param>
 /// <param name="textParserMatch">ITextMatch this info originates from.</param>
 /// <param name="propertyInfo">PropertyInfo of the metadata property whose
 /// value was compared and produced the match that generated this info.</param>
 public PdfCompareInfo(
     PdfUnstructuredDoc pdf,
     PdfUnstructuredPage pdfPage,
     PdfTextRectangle pdfTextRectangle,
     ITextMatch textParserMatch,
     PropertyInfo propertyInfo)
 {
     // Plain capture of the constructor arguments; nothing is validated
     // or copied here.
     this._PropertyInfo = propertyInfo;
     this._TextMatch = textParserMatch;
     this._PdfTextRectangle = pdfTextRectangle;
     this._PdfPage = pdfPage;
     this._Pdf = pdf;
 }

Example #6

0

Show file

File: PdfTagPatternStore.cs Project: bautista225/PdfTagger-2.0

        /// <summary>
        /// Runs the stored text extraction patterns against a document.
        /// </summary>
        /// <param name="pdf">PDF document to extract from.</param>
        /// <returns>Extraction result filled in by the per-page extraction
        /// routines, after <c>GetMetadata</c> has been invoked on it.</returns>
        public PdfTagExtractionResult Extract(PdfUnstructuredDoc pdf)
        {
            PdfTagExtractionResult result = new PdfTagExtractionResult()
            {
                Pdf          = pdf,
                MetadataType = Type.GetType(MetadataName)
            };

            _Converters = new Dictionary<Type, object>();

            IHierarchySet hierarchySet = GetHierarchySet();

            // Index the patterns by page number so the extraction loops do not
            // have to walk every pattern regardless of the page being processed.
            PdfPatternsPage = new Dictionary<int, List<PdfTagPattern>>();

            foreach (PdfTagPattern pattern in PdfPatterns)
            {
                List<PdfTagPattern> pagePatterns;

                // Single lookup: create the bucket for the page on first use.
                if (!PdfPatternsPage.TryGetValue(pattern.PdfPageN, out pagePatterns))
                {
                    pagePatterns = new List<PdfTagPattern>();
                    PdfPatternsPage[pattern.PdfPageN] = pagePatterns;
                }

                pagePatterns.Add(pattern);
            }

            foreach (var page in pdf.PdfUnstructuredPages)
            {
                // Word groups.
                ExtractFromRectangles(page.WordGroups,
                                      result.MetadataType, hierarchySet, result, page.PdfPageN);

                // Lines.
                ExtractFromRectangles(page.Lines,
                                      result.MetadataType, hierarchySet, result, page.PdfPageN, "LinesInfos");

                // Whole page text.
                ExtractFromText(result.MetadataType, result, page, hierarchySet);

                // Word groups carrying colour/font information.
                ExtractFromColorFontText(page.ColorFontWordGroups,
                                         result.MetadataType, hierarchySet, result, page.PdfPageN);
            }

            // Hand the converters gathered during this run to the result
            // before asking it to build the metadata.
            result.Converters = _Converters;

            result.GetMetadata();

            return result;
        }

Example #7

0

Show file

        /// <summary>
        /// Draws the rectangles of an unstructured document onto a copy of a
        /// pdf with their coordinates inverted (x and y swapped, measured from
        /// the page height). Used for unusual layouts such as the Moinsa case.
        /// </summary>
        /// <param name="pathSource">Path of the pdf that is read.</param>
        /// <param name="pathTarget">Path of the pdf that is written. An
        /// existing file is replaced.</param>
        /// <param name="pdf">Unstructured document whose page rectangles
        /// are drawn.</param>
        /// <param name="baseColor">Stroke color of the rectangles.</param>
        /// <param name="lines">When true the <c>Lines</c> of each page are
        /// drawn; otherwise its <c>WordGroups</c>.</param>
        /// <exception cref="IOException">Propagated unchanged when reading the
        /// source or writing the target fails.</exception>
        public static void PrintInvertRectangles(string pathSource, string pathTarget,
                                                 PdfUnstructuredDoc pdf, BaseColor baseColor, bool lines = false)
        {
            // No catch blocks: IOException / DocumentException reach the caller
            // exactly as before, but with their original stack trace intact
            // (the previous "throw ex;" reset it).
            PdfReader pdfReader = new PdfReader(pathSource);

            try
            {
                // FileMode.Create truncates an existing target. OpenOrCreate
                // would leave stale trailing bytes behind whenever the previous
                // file was longer than the new output.
                using (FileStream targetStream = new FileStream(pathTarget, FileMode.Create))
                {
                    PdfStamper pdfStamper = new PdfStamper(pdfReader, targetStream);

                    // Pages are addressed 1-based, in the same order as the
                    // pages of the unstructured document.
                    int p = 0;

                    foreach (var page in pdf.PdfUnstructuredPages)
                    {
                        p++;
                        PdfContentByte cb = pdfStamper.GetOverContent(p);

                        List<PdfTextRectangle> rectangles = lines ? page.Lines : page.WordGroups;

                        foreach (var reg in rectangles)
                        {
                            cb.SetColorStroke(baseColor);

                            iTextSharp.text.Rectangle pageSize = pdfReader.GetPageSize(p);

                            // Inverted coordinates: the horizontal extent comes
                            // from the page height minus the original y values,
                            // the vertical extent from the original x values.
                            iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                                pageSize.Height - reg.Lly, reg.Llx, pageSize.Height - reg.Ury, reg.Urx);

                            cb.Rectangle(rect.Left, rect.Bottom,
                                         rect.Width, rect.Height);
                            cb.Stroke();
                        }
                    }

                    // Finishes the output document; only reached on success.
                    pdfStamper.Close();
                }
            }
            finally
            {
                // Release the source file even when stamping fails.
                pdfReader.Close();
            }
        }

Example #8

0

Show file

File: formInvoiceModel.cs Project: bautista225/PdfTagger

 /// <summary>
 /// Loads a pdf document and tags it with the "Invoice" category.
 /// </summary>
 /// <param name="path">Path of the pdf document to analyze.</param>
 public void LoadPdfInvoiceDoc(string path)
 {
     // Remember the source path, then build the unstructured document
     // from the stored value.
     PdfPath = path;
     Pdf = new PdfUnstructuredDoc(PdfPath);

     // Documents loaded through this method are always invoices.
     Pdf.DocCategory = "Invoice";
 }

Example #9

0

Show file

        /// <summary>
        /// Returns the matches between the data of a pdf and the supplied
        /// metadata.
        /// </summary>
        /// <example>
        /// <code lang="C#">
        ///         // Starting from unstructured pdf input data
        ///         PdfUnstructuredDoc pdf = new PdfUnstructuredDoc(@"C:\ProgramData\PdfTagger\Inbox\0000021101.pdf");
        ///
        ///         // and from a set of structured data
        ///         InvoiceMetadata metadata = new InvoiceMetadata();
        ///
        ///         metadata.InvoiceNumber = "1 / 33050";
        ///         metadata.BuyerPartyID = "ES - A12070330";
        ///         metadata.IssueDate = new DateTime(2017, 11, 30);
        ///         metadata.GrossAmount = 3646.50m;
        ///         metadata.TaxesOutputsBase01 = 3013.64m;
        ///         metadata.TaxesOutputsRate01 = 21m;
        ///         metadata.TaxesOutputsAmount01 = 632.86m;
        ///
        ///         PdfCompareResult compareResult = PdfCompare.Compare(new BusinessHierarchySet(), pdf, metadata);
        /// </code>
        /// <code lang="VB">
        ///       ' Starting from unstructured pdf input data
        ///        Dim pdf As PdfUnstructuredDoc = New PdfUnstructuredDoc("C:\ProgramData\PdfTagger\Inbox\0000021101.pdf")
        ///
        ///        ' and from a set of structured data
        ///        Dim metadata As InvoiceMetadata = New InvoiceMetadata()
        ///
        ///        metadata.InvoiceNumber = "1 / 33050"
        ///        metadata.BuyerPartyID = "ES - A12070330"
        ///        metadata.IssueDate = New Date(2017, 11, 30)
        ///        metadata.GrossAmount = CDec(3646.5)
        ///        metadata.TaxesOutputsBase01 = CDec(3013.64)
        ///        metadata.TaxesOutputsRate01 = 21
        ///        metadata.TaxesOutputsAmount01 = CDec(632.86)
        ///
        ///        Dim compareResult As PdfCompareResult = PdfCompare.Compare(New BusinessHierarchySet(), pdf, metadata)
        /// </code>
        /// </example>
        /// <param name="hierarchySet">Catalog of parser hierarchies by type.
        /// Each metadata property is compared using the hierarchy obtained
        /// from this catalog; properties for which the catalog returns no
        /// hierarchy are skipped.</param>
        /// <param name="pdf">PdfUnstructuredDoc instance holding the
        /// unstructured data obtained from a pdf.</param>
        /// <param name="metadata">Structured data to compare against the
        /// unstructured data of the pdf. Properties whose value is null, or
        /// for which <c>IsZeroNumeric</c> is true, are not compared.</param>
        /// <returns>PdfCompareResult instance with the results obtained
        /// from the comparison.</returns>
        public static PdfCompareResult Compare(IHierarchySet hierarchySet,
                                               PdfUnstructuredDoc pdf, IMetadata metadata)
        {
            PdfCompareResult compareResult = new PdfCompareResult(pdf, metadata, hierarchySet);

            // Every text-string match is recorded once per Type value, in this order.
            string[] textStringTypes = { "NA", "X", "Y" };

            foreach (PropertyInfo pInf in metadata.GetType().GetProperties())
            {
                object pValue = pInf.GetValue(metadata);

                // Parser hierarchy for this property.
                ITextParserHierarchy parserHierarchy = hierarchySet.GetParserHierarchy(pInf);

                // Without a hierarchy there is nothing to configure or compare.
                // Checking this first avoids dereferencing a null hierarchy for
                // string properties below.
                if (parserHierarchy == null)
                {
                    continue;
                }

                // String properties get the regex pattern of the first parser
                // set from their value; this happens even when the value is
                // null (an empty string is used as input then).
                if (pInf.PropertyType == typeof(string))
                {
                    parserHierarchy.SetParserRegexPattern(0, TxtRegex.Replace($"{pValue}"));
                }

                if (pValue == null || IsZeroNumeric(pValue))
                {
                    continue;
                }

                // Walk all the pdf data to compare against.
                foreach (var page in pdf.PdfUnstructuredPages)
                {
                    // Word groups
                    foreach (var wordGroup in page.WordGroups)
                    {
                        foreach (var match in parserHierarchy.GetMatches(pValue, wordGroup.Text))
                        {
                            compareResult.WordGroupsInfos.Add(new PdfCompareInfo(pdf, page, wordGroup, match, pInf, null));
                        }
                    }

                    // Lines
                    foreach (var line in page.Lines)
                    {
                        foreach (var match in parserHierarchy.GetMatches(pValue, line.Text))
                        {
                            compareResult.LinesInfos.Add(new PdfCompareInfo(pdf, page, line, match, pInf, null));
                        }
                    }

                    // Text groups carrying properties such as the font color
                    foreach (var textString in page.TextStringGroups)
                    {
                        foreach (var match in parserHierarchy.GetMatches(pValue, textString.Text))
                        {
                            foreach (string textStringType in textStringTypes)
                            {
                                PdfClownTextString typedTextString = new PdfClownTextString(
                                    textString.Text, textString.ColorFill, textString.ColorStroke,
                                    textString.FontType, textString.FontSize)
                                {
                                    Type      = textStringType,
                                    Rectangle = textString.Rectangle
                                };

                                compareResult.TextStringInfos.Add(
                                    new PdfCompareInfo(pdf, page, null, match, pInf, typedTextString));
                            }
                        }
                    }

                    // Whole page text
                    foreach (var match in parserHierarchy.GetMatches(pValue, page.PdfText))
                    {
                        Type       txtBoundMatchGenType = typeof(TextBoundMatch<>).MakeGenericType(pInf.PropertyType);
                        ITextMatch txtBoundMatch        = (ITextMatch)Activator.CreateInstance(txtBoundMatchGenType, match);
                        ITextMatch txtBoundMatchSoft    = (ITextMatch)Activator.CreateInstance(txtBoundMatchGenType, match);

                        // Direct cast: a type mismatch surfaces as an
                        // InvalidCastException instead of a null dereference.
                        ((ITextBoundMatch)txtBoundMatchSoft).UseLengthOnPatternDigitReplacement = false;

                        if (txtBoundMatch.Pattern != null)
                        {
                            dynamic converter = parserHierarchy.GetConverter(match.Pattern);

                            // Contextual bounds
                            if (IsAllMatchesOK(txtBoundMatch, page, pValue, converter))
                            {
                                compareResult.PdfTextInfos.Add(
                                    new PdfCompareInfo(pdf, page, null, txtBoundMatch, pInf, null));
                            }

                            // Less strict contextual bounds
                            if (IsAllMatchesOK(txtBoundMatchSoft, page, pValue, converter))
                            {
                                compareResult.PdfTextInfos.Add(
                                    new PdfCompareInfo(pdf, page, null, txtBoundMatchSoft, pInf, null));
                            }
                        }
                    }
                }
            }

            return compareResult;
        }

Example #10

0

Show file

File: PdfCheckResult.cs Project: bautista225/PdfTagger-2.0

 /// <summary>
 /// Creates a PdfCheckResult for a pdf and the metadata it is checked
 /// against.
 /// </summary>
 /// <param name="pdf">Unstructured information of a PDF.</param>
 /// <param name="metadata">Correct metadata coming from a database.</param>
 public PdfCheckResult(PdfUnstructuredDoc pdf, IMetadata metadata)
 {
     // Both arguments are stored as given; no validation happens here.
     this.Pdf = pdf;
     this.Metadata = metadata;
 }

C# (CSharp) PdfUnstructuredDoc Examples