/// <summary>
/// Reads the crime-report PDF, extracts the grid data page by page into the data set,
/// and writes the result as CSV to "longbeach_crime.csv".
/// </summary>
/// <param name="pdfSourceFile">Value handed to the <c>PdfReader</c> constructor to open the PDF.</param>
private void ReadPdf(String pdfSourceFile)
{
    PdfReader pdfReader = new PdfReader(pdfSourceFile);
    try
    {
        // Get column x-locations on page 1. The extracted text itself is not used;
        // the call is made for its effect on the strategy, which is read right after.
        var strategy = new MyLocationTextExtractionStrategy();
        PdfTextExtractor.GetTextFromPage(pdfReader, 1, strategy);
        GetColumnXPoints(strategy);

        // Loop through the grid data, one fresh strategy per page.
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            strategy = new MyLocationTextExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // FillDataSet returns false once it has seen the end-of-data marker.
            var isContinue = FillDataSet(strategy);
            if (!isContinue)
            {
                break;
            }
        }
    }
    finally
    {
        // Always release the reader, even when extraction or parsing throws.
        pdfReader.Close();
    }

    var csv = ConvertDataSetToCSV();
    WriteFile("longbeach_crime.csv", csv);
}
/// <summary>
/// Walks the text chunks captured by <paramref name="strategy"/> and collects the integer
/// cells of each grid row into a dictionary keyed by crime code.
/// </summary>
/// <remarks>
/// Chunks are skipped until the first 3-character text appears. Every 3-character text
/// triggers a call to <c>WriteRow</c> with the row collected so far, and <c>WriteRow</c>
/// is called once more after the loop. Text that does not parse as an integer is ignored.
/// Any exception raised while handling a chunk is written to the console and the loop
/// moves on to the next chunk.
/// </remarks>
/// <param name="strategy">Extraction strategy whose <c>locationalResult</c> is iterated.</param>
/// <returns>
/// <c>false</c> when a chunk with text "SHPD" or "Unk" was met (the caller should stop
/// reading further pages); otherwise <c>true</c>.
/// </returns>
private bool FillDataSet(MyLocationTextExtractionStrategy strategy)
{
    bool keepReading = true;
    bool started = false;
    Dictionary<CrimeCodes, int> currentRow = new Dictionary<CrimeCodes, int>();

    foreach (MyLocationTextExtractionStrategy.TextChunk chunk in strategy.locationalResult)
    {
        try
        {
            // Current cell text (expected to be the string form of an integer).
            string cell = chunk.text.Trim();

            // End marker: stop here and tell ReadPdf() not to iterate more pages.
            if (cell == "SHPD" || cell == "Unk")
            {
                keepReading = false;
                break;
            }

            if (cell.Length == 3)
            {
                // A 3-character text is treated as a region code: it starts the data
                // section and marks a row boundary, so hand the collected row over.
                started = true;
                WriteRow(ref currentRow);
            }
            else if (!started)
            {
                // Still inside the header text / column names.
                continue;
            }

            // Only integer cells are recorded.
            int parsed;
            if (!int.TryParse(cell, out parsed))
            {
                continue;
            }

            // Map the x location of the text to its column and store the value.
            float x = chunk.startLocation[0];
            currentRow.Add(GetCrimeCode(x), parsed);
            Console.WriteLine(cell);
        }
        catch (Exception ex)
        {
            Console.Write($"Error: {chunk.text}, {ex.Message}");
        }
    }

    // Hand over whatever was collected for the last row.
    WriteRow(ref currentRow);
    return keepReading;
}
/// <summary>
/// Rebuilds <c>CodeLookup</c> from the text chunks of a page: every chunk whose text is the
/// name of a <c>CrimeCodes</c> member is stored with the truncated x location of the chunk.
/// Also reads the report month and year from the "REPORT MONTH = ..." header chunk.
/// </summary>
/// <param name="strategy">Extraction strategy whose <c>locationalResult</c> is iterated.</param>
private void GetColumnXPoints(MyLocationTextExtractionStrategy strategy)
{
    CodeLookup.Clear();

    foreach (MyLocationTextExtractionStrategy.TextChunk chunk in strategy.locationalResult)
    {
        // Spaces become underscores so multi-word headers can match enum member names.
        string text = chunk.text.Trim().Replace(" ", "_");

        // Extract year/month from the report header.
        if (text.StartsWith("REPORT_MONTH") && text.IndexOf('=') > -1 && text.IndexOf('/') > -1 && text.IndexOf('-') > -1)
        {
            // "REPORT_MONTH_=_01/01/2017_-_01/31/2017" -> "01/01/2017-01/31/2017"
            var dateRange = text.Replace("_", "").Split('=')[1];
            var dateParts = dateRange.Split('/');

            // The checks above only prove that one '/' exists; the year sits in the
            // third '/'-separated part, so make sure it is there before indexing.
            if (dateParts.Length > 2)
            {
                // month and year are members declared outside this method.
                Int32.TryParse(dateParts[0], out month);
                Int32.TryParse(dateParts[2].Split('-')[0], out year);
            }
            continue;
        }

        CrimeCodes crimeCode;
        // IsDefined is given the text, so only an exact (case-sensitive) member name
        // passes; purely numeric text is rejected even though TryParse accepts it.
        if (Enum.TryParse<CrimeCodes>(text, true, out crimeCode) && Enum.IsDefined(typeof(CrimeCodes), text))
        {
            // Special considerations: "AUTO" is only accepted once BIKE has been
            // recorded, and "TOTAL" only once PART_2 has been recorded.
            if (text.Equals("AUTO") && !CodeLookup.ContainsKey(CrimeCodes.BIKE))
            {
                continue;
            }
            if (text.Equals("TOTAL") && !CodeLookup.ContainsKey(CrimeCodes.PART_2))
            {
                continue;
            }

            // Keep the first x location seen for each code.
            if (!CodeLookup.ContainsKey(crimeCode))
            {
                CodeLookup.Add(crimeCode, chunk.startLocation[0].Truncate(0));
            }
        }
    }

    // Order columns by x location.
    // NOTE(review): Dictionary<TKey,TValue> documents its enumeration order as undefined,
    // so this ordering is not contractually guaranteed — confirm how CodeLookup is consumed.
    CodeLookup = CodeLookup.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
}