Beispiel #1
0
 /// <summary>
 /// Opens the document returned by <c>GetFilename()</c> and checks that
 /// content-order text extraction yields a non-null result for every page.
 /// </summary>
 public void CanExtractContentOrderText()
 {
     using (var pdf = PdfDocument.Open(GetFilename()))
     {
         foreach (var currentPage in pdf.GetPages())
         {
             // Only the absence of null is verified; the extracted content itself is not inspected.
             Assert.NotNull(ContentOrderTextExtractor.GetText(currentPage));
         }
     }
 }
        /// <summary>
        /// Opens the PDF at <paramref name="filePath"/> and writes the content-order
        /// text of each page to the console, one page after another.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to open.</param>
        public static void Run(string filePath)
        {
            using (var pdf = PdfDocument.Open(filePath))
            {
                foreach (var currentPage in pdf.GetPages())
                {
                    // NOTE(review): the meaning of the second argument (true) is defined by
                    // ContentOrderTextExtractor.GetText - confirm against the library docs.
                    Console.WriteLine(ContentOrderTextExtractor.GetText(currentPage, true));
                }
            }
        }
        /// <summary>
        /// Reads every page of the PDF in <paramref name="readStream"/>, flattens the extracted
        /// text into a list of trimmed, non-empty lines, and fills a new <see cref="Document"/>
        /// from those lines according to the "lookup" section of the parser configuration.
        /// </summary>
        /// <remarks>
        /// For each property in <c>Document.Properties</c> the object-valued lookup entry with the
        /// same name is evaluated. Exactly one anchor rule selects a line, checked in this order:
        /// <list type="bullet">
        /// <item><description>"index": the line at that zero-based position.</description></item>
        /// <item><description>"startsWith": a line found via <c>findLine</c> that starts with the given string, or with any string of the given array (ordinal comparison).</description></item>
        /// <item><description>"indexOf": the first line equal to the given string.</description></item>
        /// <item><description>"relativeTo": the line already selected for the named property.</description></item>
        /// </list>
        /// An optional "offset" then moves the selection by that many lines, and an optional
        /// "regex" (with optional "group", default 0) extracts part of the selected line.
        /// A property is left unset when it has no object-valued entry, when its anchor cannot be
        /// found, when the resulting line position lies outside the extracted text, or when its
        /// "relativeTo" target is never resolved.
        /// </remarks>
        /// <param name="name">Not used by this method.</param>
        /// <param name="readStream">Stream containing the PDF to parse.</param>
        /// <returns>A new <see cref="Document"/> with every resolvable property set.</returns>
        /// <exception cref="Exception">The parser configuration has no "lookup" object.</exception>
        public Document Parse(string name, Stream readStream)
        {
            List<string> textList = new List<string>();

            using (var pdf = PdfDocument.Open(readStream))
            {
                foreach (var page in pdf.GetPages())
                {
                    var lines = ContentOrderTextExtractor.GetText(page, true)
                                .Split(GetLineSeparator())
                                .Select(line => line.Trim())
                                .Where(line => line.Length > 0);
                    textList.AddRange(lines);
                }
            }

            string[] texts = textList.ToArray();

            Document doc = new Document();

            // Line position at which each resolved property was found; consulted by "relativeTo".
            Dictionary<string, int> extractedPosition = new Dictionary<string, int>();

            JObject lookup = GetConfig()["lookup"] as JObject;

            if (lookup is null)
            {
                throw new Exception("Invalid parser config: lookup is null");
            }

            // A queue is used because a "relativeTo" rule may depend on a property that has not
            // been processed yet; such properties are pushed back and retried later.
            Queue<PropertyInfo> propQueue = new Queue<PropertyInfo>(Document.Properties);

            // Number of properties pushed back since a position was last recorded. Once it
            // reaches the queue length, every remaining property is waiting on something that
            // can no longer appear, so retrying would loop forever.
            int deferredSinceProgress = 0;

            while (propQueue.Count > 0)
            {
                PropertyInfo docProp = propQueue.Dequeue();

                // Only object-valued entries describe extraction rules; missing entries and
                // entries of any other JSON type are ignored.
                if (!(lookup[docProp.Name] is JObject rules))
                {
                    continue;
                }

                string target    = null;
                int    targetPos = -1;

                if (rules["index"] != null)
                {
                    // Keep the position as well as the line so that "offset" and "relativeTo"
                    // work for index-anchored properties.
                    targetPos = (int)rules["index"];
                    if (targetPos < 0 || targetPos >= texts.Length)
                    {
                        continue;
                    }

                    target = texts[targetPos];
                }
                else if (rules["startsWith"] != null)
                {
                    JToken startsWith = rules["startsWith"];
                    if (startsWith.Type == JTokenType.String)
                    {
                        string prefix = (string)startsWith;
                        (target, targetPos) = findLine(texts, text => text.StartsWith(prefix, StringComparison.Ordinal));
                    }
                    else
                    {
                        string[] prefixes = startsWith.ToObject<string[]>();
                        (target, targetPos) = findLine(texts, text => prefixes.Any(query => text.StartsWith(query, StringComparison.Ordinal)));
                    }
                }
                else if (rules["indexOf"] != null)
                {
                    targetPos = Array.IndexOf(texts, (string)rules["indexOf"]);
                    if (targetPos < 0)
                    {
                        continue;
                    }

                    target = texts[targetPos];
                }
                else if (rules["relativeTo"] != null)
                {
                    if (!extractedPosition.TryGetValue((string)rules["relativeTo"], out targetPos))
                    {
                        // Referenced property not resolved yet: retry after the others.
                        propQueue.Enqueue(docProp);
                        deferredSinceProgress++;

                        if (deferredSinceProgress >= propQueue.Count)
                        {
                            // Everything still queued has been deferred without any new
                            // position being recorded; the references cannot be resolved.
                            break;
                        }

                        continue;
                    }

                    target = texts[targetPos];
                }

                // No anchor line was found (or no anchor rule was given). Applying an offset to
                // the -1 sentinel would select an unrelated line, so the property is skipped.
                // NOTE(review): assumes findLine signals "not found" with a null line and/or a
                // negative position - confirm against its implementation.
                if (target is null || targetPos < 0)
                {
                    continue;
                }

                // Allow the user to pick a line relative to some common string like "Total Due".
                if (rules["offset"] != null)
                {
                    targetPos += (int)rules["offset"];
                    if (targetPos < 0 || targetPos >= texts.Length)
                    {
                        continue;
                    }

                    target = texts[targetPos];
                }

                extractedPosition[docProp.Name] = targetPos;
                deferredSinceProgress = 0;

                // Optional refinement: keep only one regex group of the selected line.
                // When the pattern does not match, the group value is the empty string.
                string value;
                if (rules["regex"] != null)
                {
                    Match matched = Regex.Match(target, (string)rules["regex"]);
                    int   group   = (int?)rules["group"] ?? 0;

                    value = matched.Groups[group].Value;
                }
                else
                {
                    value = target;
                }

                docProp.SetValue(doc, serializeValue(value, docProp.PropertyType));
            }

            return doc;
        }