예제 #1
0
        /// <summary>
        /// Reads the PDF at <paramref name="pdfSourceFile"/>, determines the column x-locations
        /// from page 1, feeds every page's text chunks into the data set, and finally writes
        /// the data set as CSV to "longbeach_crime.csv".
        /// </summary>
        /// <param name="pdfSourceFile">Value handed to the <c>PdfReader</c> constructor (the PDF to read).</param>
        private void ReadPdf(String pdfSourceFile)
        {
            PdfReader pdfReader = new PdfReader(pdfSourceFile);
            try
            {
                //get column x-locations on page 1; the returned text is not needed,
                //the call is made so that the strategy collects the page's chunks
                var strategy = new MyLocationTextExtractionStrategy();
                PdfTextExtractor.GetTextFromPage(pdfReader, 1, strategy);

                GetColumnXPoints(strategy);

                //loop thru grid data, one fresh strategy per page
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    strategy = new MyLocationTextExtractionStrategy();
                    PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                    //FillDataSet returns false once it has seen its end marker
                    if (!FillDataSet(strategy))
                    {
                        break;
                    }
                }
            }
            finally
            {
                //always release the reader, even when extraction or parsing throws
                pdfReader.Close();
            }

            var csv = ConvertDataSetToCSV();

            WriteFile("longbeach_crime.csv", csv);
        }
예제 #2
0
        /// <summary>
        /// Walks the text chunks collected for one page and adds every integer value to the
        /// current row, keyed by the crime code that <c>GetCrimeCode</c> returns for the chunk's
        /// x location. A 3-character chunk flushes the row collected so far via <c>WriteRow</c>.
        /// </summary>
        /// <param name="strategy">Extraction strategy whose <c>locationalResult</c> holds the page's chunks.</param>
        /// <returns>
        /// <c>false</c> when a "SHPD" or "Unk" chunk was reached, meaning the caller should not
        /// process further pages; otherwise <c>true</c>.
        /// </returns>
        private bool FillDataSet(MyLocationTextExtractionStrategy strategy)
        {
            var isContinue  = true;
            var row         = new Dictionary <CrimeCodes, int>();
            var shouldStart = false;

            foreach (MyLocationTextExtractionStrategy.TextChunk chunk in strategy.locationalResult)
            {
                try
                {
                    //get current text (should be string of integer)
                    var text = chunk.text.Trim();

                    //time to end: no more pages should be iterated in ReadPdf()
                    if (text.Equals("SHPD") || text.Equals("Unk"))
                    {
                        isContinue = false;
                        break;
                    }

                    if (text.Length == 3)
                    {
                        //a 3-character chunk is treated as the region code: it starts data
                        //collection and flushes the previous row before a new one begins
                        //NOTE(review): only the length is tested, not that the characters are
                        //digits - a 3-character header or count would match too; confirm
                        shouldStart = true;
                        WriteRow(ref row);
                    }
                    else if (!shouldStart)
                    {
                        //skip over first few rows of header text column names
                        continue;
                    }

                    //only integer chunks carry data; everything else is ignored
                    int count;
                    if (!Int32.TryParse(text, out count))
                    {
                        continue;
                    }

                    //get x location of current text and map it to its column
                    float xloc    = chunk.startLocation[0];
                    var crimeCode = GetCrimeCode(xloc);

                    //keep the first value seen for a column; report the duplicate explicitly
                    //instead of relying on Dictionary.Add throwing
                    if (row.ContainsKey(crimeCode))
                    {
                        Console.WriteLine($"Error: {chunk.text}, column {crimeCode} already has a value in this row");
                        continue;
                    }

                    row.Add(crimeCode, count);
                    Console.WriteLine(text);
                }
                catch (Exception ex)
                {
                    //best effort: report the failing chunk on its own line and keep going
                    Console.WriteLine($"Error: {chunk.text}, {ex.Message}");
                }
            }

            //flush whatever was collected for the last row on this page
            WriteRow(ref row);
            return(isContinue);
        }
예제 #3
0
        /// <summary>
        /// Rebuilds <c>CodeLookup</c> from the chunks of one page: every chunk whose text is the
        /// name of a <c>CrimeCodes</c> member is registered with the truncated x location of the
        /// chunk. A "REPORT_MONTH = mm/dd/yyyy - mm/dd/yyyy" chunk is used to set the
        /// <c>month</c> and <c>year</c> fields instead.
        /// </summary>
        /// <param name="strategy">Extraction strategy whose <c>locationalResult</c> holds the page's chunks.</param>
        private void GetColumnXPoints(MyLocationTextExtractionStrategy strategy)
        {
            CodeLookup.Clear();

            foreach (MyLocationTextExtractionStrategy.TextChunk chunk in strategy.locationalResult)
            {
                //spaces become underscores so that header text can match enum member names
                string text = chunk.text.Trim().Replace(" ", "_");

                //extract year/month
                if (text.StartsWith("REPORT_MONTH", StringComparison.Ordinal) && text.IndexOf('=') > -1 && text.IndexOf('/') > -1 && text.IndexOf('-') > -1)
                {
                    //"REPORT_MONTH_=_01/01/2017_-_01/31/2017"
                    var range = text.Replace("_", "").Split('=')[1];
                    //"01/01/2017-01/31/2017" -> ["01", "01", "2017-01", "31", "2017"]
                    var dateParts = range.Split('/');
                    //the guard above only proves one '/', so check before indexing part 2
                    if (dateParts.Length > 2)
                    {
                        Int32.TryParse(dateParts[0], out month);
                        Int32.TryParse(dateParts[2].Split('-')[0], out year);
                    }
                    continue;
                }

                //IsDefined with a string matches member names case-sensitively, so in effect
                //only text that is exactly an enum member name is accepted here
                CrimeCodes crimeCode;
                if (Enum.TryParse <CrimeCodes>(text, true, out crimeCode) && Enum.IsDefined(typeof(CrimeCodes), text))
                {
                    //special considerations: "AUTO" is ignored until BIKE is registered and
                    //"TOTAL" is ignored until PART_2 is registered
                    //NOTE(review): presumably these words occur earlier in other headers - confirm
                    if (text.Equals("AUTO") && !CodeLookup.ContainsKey(CrimeCodes.BIKE))
                    {
                        continue;
                    }
                    else if (text.Equals("TOTAL") && !CodeLookup.ContainsKey(CrimeCodes.PART_2))
                    {
                        continue;
                    }

                    //Add enum and float to Lookup List; the first occurrence of a code wins
                    if (!CodeLookup.ContainsKey(crimeCode))
                    {
                        CodeLookup.Add(crimeCode, chunk.startLocation[0].Truncate(0));
                    }
                }
            }

            //order columns by x location
            //NOTE(review): this relies on the dictionary enumerating in insertion order, which
            //the Dictionary documentation does not guarantee - confirm consumers are fine with it
            CodeLookup = CodeLookup.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
        }