/// <summary>
/// Breaks the extracted text into sentence-sized entries, adding a "\r\n" entry
/// after the sentences of every paragraph.
/// </summary>
/// <param name="pdfContents">Extraction result whose <c>Text</c> is reformatted.</param>
/// <returns>
/// The non-empty sentence fragments (hyphens removed, "." appended) in document order,
/// with one "\r\n" entry following each paragraph.
/// </returns>
private string[] FormatLines(TextExtractionResult pdfContents)
{
    const string paragraphMarker = "@@NEWPARAGRAPH@@";

    // A blank line becomes a paragraph marker; a hyphen directly followed by a
    // line break is removed together with the break (presumably a wrapped word).
    var normalized = pdfContents.Text
        .Replace("\r\n\r\n", paragraphMarker)
        .Replace("-\r\n", "");

    var paragraphs = normalized
        .Replace(paragraphMarker, "\n")
        .Replace("\n\n", "")
        .Split('\n');

    var lines = new List<string>();
    foreach (var paragraph in paragraphs)
    {
        foreach (var fragment in paragraph.Split('.'))
        {
            var sentence = fragment.Replace("-", "");
            if (sentence.Length > 0)
            {
                lines.Add(sentence + ".");
            }
        }

        // Paragraph separator entry, emitted even when the paragraph had no sentences.
        lines.Add("\r\n");
    }

    return lines.ToArray();
}
/// <summary>
/// Extracts the distribution list from the raw text, normalises it to a
/// ";"-separated string and stores it under <c>KnownReportParts.PART_DISTRIBUTION</c>.
/// </summary>
/// <param name="report">Report parts collected so far; receives the distribution entry.</param>
/// <param name="rawInputBody">Extraction result whose <c>Text</c> is searched.</param>
/// <exception cref="PartNotFoundException">
/// The pattern did not match and this part is marked as required.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    var rslt = _regex.Match(rawInputBody.Text);
    if (!rslt.Success)
    {
        if (_required)
        {
            throw new PartNotFoundException("Distribution was not found.");
        }

        return;
    }

    var tmp = rslt.Groups[1].Value.ToUpper().Trim("\r\n ".ToCharArray());
    var dists = tmp.Split(@"/\; ".ToCharArray());

    // Join first, normalise afterwards: a seedless Aggregate never invokes its
    // lambda for a single-element input, so doing the replacements inside the
    // lambda would skip them when there is only one entry. None of the patterns
    // contains ';', so replacing once on the joined string gives the same result
    // for multi-entry input.
    var joined = dists.Aggregate((left, right) => left + ";" + right.ToUpper().Trim());
    var distribution = joined
        .Replace("DIII", "D3")
        .Replace("DII", "D2")
        .Replace("DV", "D5");

    // Cut at the "FILE" marker only when it is present; IndexOf returns -1
    // otherwise and Substring(0, -1) would throw ArgumentOutOfRangeException.
    var fileMarkerIndex = distribution.IndexOf("FILE");
    if (fileMarkerIndex >= 0)
    {
        distribution = distribution.Substring(0, fileMarkerIndex);
    }

    distribution = distribution.Trim("; ".ToCharArray());
    distribution = _redundantColon.Replace(distribution, ";");
    report.Add(KnownReportParts.PART_DISTRIBUTION, RemoveSpaces(distribution));
}
/// <summary>
/// Determines the evaluation of the report and stores it under
/// <c>KnownReportParts.PART_EVALUATION</c>.
/// </summary>
/// <remarks>
/// The value is "DOC" when the upper-cased CNR ends with one of the <c>DocEval</c>
/// suffixes or when the text captured by <c>_regex</c> contains "DOC"; otherwise it is
/// the concatenation of the first two groups of <c>_regex2</c> applied to that capture.
/// </remarks>
/// <param name="report">Report parts collected so far; must already contain the CNR.</param>
/// <param name="rawInputBody">Extraction result whose <c>Text</c> is searched.</param>
/// <exception cref="PartNotFoundException">
/// No evaluation could be determined and this part is marked as required.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    var cnrImpliesDoc = DocEval.Any(
        suffix => report[KnownReportParts.PART_CNR].ToUpper().EndsWith(suffix));
    if (cnrImpliesDoc)
    {
        report.Add(KnownReportParts.PART_EVALUATION, "DOC");
        return;
    }

    var sectionMatch = _regex.Match(rawInputBody.Text);
    if (sectionMatch.Success)
    {
        var captured = sectionMatch.Groups[1].Value.ToUpper();
        if (captured.Contains("DOC"))
        {
            report.Add(KnownReportParts.PART_EVALUATION, "DOC");
            return;
        }

        var gradeMatch = _regex2.Match(captured);
        if (gradeMatch.Success)
        {
            var evaluation = gradeMatch.Groups[1].Value + gradeMatch.Groups[2].Value;
            report.Add(KnownReportParts.PART_EVALUATION, evaluation);
            return;
        }
    }

    // Nothing matched: only an error when the part is mandatory.
    if (_required)
    {
        throw new PartNotFoundException("Evaluation was not found");
    }
}
/// <summary>
/// Extracts the date of the report from the text and stores it twice: as
/// "month-day-year" under <c>KnownReportParts.PART_DATEOFREPORT_STR</c> and in
/// round-trip ("O") format under <c>KnownReportParts.PART_DATEOFREPORT_UTC</c>.
/// </summary>
/// <remarks>
/// Reports whose upper-cased CNR ends with ".SDDP" are matched with <c>_regexSddp</c>,
/// all others with <c>_regex</c>. In both patterns group 1 is read as the day, group 2
/// as the month name (its first three letters are looked up in <c>_lookup</c>) and
/// group 3 as the year; a two-digit year has 2000 added.
/// </remarks>
/// <param name="report">Report parts collected so far; must already contain the CNR.</param>
/// <param name="textExtractionResult">Extraction result whose <c>Text</c> is searched.</param>
/// <exception cref="PartNotFoundException">
/// The pattern did not match while the part is required, or any error occurred while
/// reading or parsing the matched values.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult textExtractionResult)
{
    try
    {
        var isSddp = report[KnownReportParts.PART_CNR].ToUpper().EndsWith(".SDDP");
        var match = isSddp
            ? _regexSddp.Match(textExtractionResult.Text)
            : _regex.Match(textExtractionResult.Text);

        if (!match.Success)
        {
            if (_required)
            {
                throw new PartNotFoundException("Date of report not found");
            }

            // Optional part without a match: nothing to add. Without this return
            // the code below would fail on the empty groups.
            return;
        }

        var monthStr = match.Groups[2].Value.Trim().ToLower().Substring(0, 3);
        var monthNum = _lookup[monthStr];
        var yearStr = match.Groups[3].Value.Trim();
        var day = int.Parse(match.Groups[1].Value.Trim());
        if (yearStr.Length == 2)
        {
            yearStr = (int.Parse(yearStr) + 2000).ToString();
        }

        // Compute both values before touching the report so a parse failure
        // cannot leave only one of the two entries behind.
        var dateStr = $"{monthNum}-{day:00}-{yearStr}";
        // "O" (letter) is the round-trip specifier; "0" (digit) is not a valid
        // standard format and makes DateTime.ToString throw FormatException.
        var dateRoundTrip = DateTime.Parse($"{yearStr}-{monthNum}-{day:00}").ToString("O");

        report.Add(KnownReportParts.PART_DATEOFREPORT_STR, dateStr);
        report.Add(KnownReportParts.PART_DATEOFREPORT_UTC, dateRoundTrip);
    }
    catch (Exception)
    {
        // Every failure (missing CNR, unknown month, unparsable number, ...) is
        // reported to the caller as a missing part.
        throw new PartNotFoundException("Date of report not found");
    }
}
/// <summary>
/// Drag-and-drop handler: runs text extraction on the first dropped file and
/// shows the extracted text in <c>textBox1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Drag event data; its file-drop payload supplies the file paths.</param>
private void textBox1_DragDrop(object sender, DragEventArgs e)
{
    var droppedPaths = (string[])e.Data.GetData(DataFormats.FileDrop);
    if (droppedPaths == null || droppedPaths.Length == 0)
    {
        return;
    }

    // Only the first file is processed; any further dropped files are ignored.
    var extraction = _textExtractor.Extract(droppedPaths[0]);
    textBox1.Text = extraction.Text;
}
/// <summary>
/// Determines the subject of the report and stores it under
/// <c>KnownReportParts.PART_SUBJECT</c>.
/// </summary>
/// <remarks>
/// The source of the subject depends on the CNR and report type:
/// <list type="bullet">
/// <item>CNR ends with one of the <c>OPNLS</c> suffixes: upper-cased report type followed by the quoted CNR.</item>
/// <item>CNR ends with ".ESR": group 1 of <c>_regex1</c>, passed through <c>Cleanup</c> and stripped of <c>_regex3</c> matches.</item>
/// <item>Report type "AFTER MEETING REPORT" / "AFTER ACTIVITY REPORT", or CNR ends with ".SDDP":
/// the "subject" metadata entry when it is non-blank, otherwise the report type.</item>
/// <item>Anything else: group 2 of <c>_regex</c>, trimmed and upper-cased.</item>
/// </list>
/// </remarks>
/// <param name="report">Report parts collected so far; must already contain the CNR.</param>
/// <param name="rawInputBody">Extraction result providing the text and metadata.</param>
/// <exception cref="PartNotFoundException">
/// No subject was added and this part is marked as required.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    var cnr = report[KnownReportParts.PART_CNR];
    var cnrUpper = cnr.ToUpper();

    if (OPNLS.Any(suffix => cnrUpper.EndsWith(suffix)))
    {
        var reportTypeLabel = report[KnownReportParts.REPORTTYPE].ToUpper();
        report.Add(KnownReportParts.PART_SUBJECT, $"{reportTypeLabel} '{cnr}'");
    }
    else if (cnrUpper.EndsWith(".ESR"))
    {
        var esrMatch = _regex1.Match(rawInputBody.Text);
        if (esrMatch.Success)
        {
            var cleaned = Cleanup(esrMatch.Groups[1].Value);
            report.Add(KnownReportParts.PART_SUBJECT, _regex3.Replace(cleaned, ""));
        }
    }
    else
    {
        var reportTypeUpper = report[KnownReportParts.REPORTTYPE].ToUpper();
        var subjectFromMetadata = reportTypeUpper == "AFTER MEETING REPORT"
            || reportTypeUpper == "AFTER ACTIVITY REPORT"
            || cnrUpper.EndsWith(".SDDP");

        if (subjectFromMetadata)
        {
            string metadataSubject;
            var hasMetadataSubject =
                rawInputBody.Metadata.TryGetValue("subject", out metadataSubject)
                && !string.IsNullOrWhiteSpace(metadataSubject);

            // Fall back to the report type when the metadata has no usable subject.
            var subjectSource = hasMetadataSubject ? metadataSubject.ToUpper() : reportTypeUpper;
            report.Add(KnownReportParts.PART_SUBJECT, RemoveRedundantSpaces(subjectSource));
        }
        else
        {
            var subjectMatch = _regex.Match(rawInputBody.Text);
            if (subjectMatch.Success)
            {
                var rawSubject = subjectMatch.Groups[2].Value.Trim("\r\n\t ".ToCharArray()).ToUpper();
                report.Add(KnownReportParts.PART_SUBJECT, RemoveInBetweenWhiteSpaces(rawSubject));
            }
        }
    }

    if (_required && !report.ContainsKey(KnownReportParts.PART_SUBJECT))
    {
        throw new PartNotFoundException("Subject was not found.");
    }
}
/// <summary>
/// Derives the CNR from the report's file name and stores it, trimmed and
/// upper-cased, under <c>KnownReportParts.PART_CNR</c> (overwriting any existing value).
/// </summary>
/// <param name="report">Report parts collected so far; must contain the file name.</param>
/// <param name="rawInputBody">Extraction result (not used by this builder).</param>
/// <exception cref="PartNotFoundException">
/// The file name is missing, or the pattern did not match and this part is required.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    if (!report.ContainsKey(KnownReportParts.PART_FILENAME))
    {
        throw new PartNotFoundException("A filename is needed in order to parse the CNR of report.");
    }

    var fileNameMatch = _regex.Match(report[KnownReportParts.PART_FILENAME]);
    if (fileNameMatch.Success)
    {
        // The whole match (group 0) is the CNR.
        report[KnownReportParts.PART_CNR] = fileNameMatch.Value.Trim().ToUpper();
        return;
    }

    if (_required)
    {
        throw new PartNotFoundException("CNR was not found.");
    }
}
/// <summary>
/// Runs text extraction on the given bytes and returns the extracted text together
/// with the extraction metadata.
/// </summary>
/// <param name="data">Raw content handed to the text extractor.</param>
/// <param name="onError">Callback invoked with any exception thrown during extraction.</param>
/// <param name="MetaData">
/// Receives the extractor's metadata when non-blank text was extracted; otherwise an
/// empty dictionary.
/// </param>
/// <returns>The extracted text, or an empty string when nothing usable was extracted or extraction failed.</returns>
public string ParseMediaText(byte[] data, Action<Exception> onError, out Dictionary<string, string> MetaData)
{
    // Constructed outside the try block: a failure here is not routed to onError.
    var extractor = new TextExtractor();

    var extractedMetadata = new Dictionary<string, string>();
    var extractedText = string.Empty;

    try
    {
        var extraction = extractor.Extract(data);
        if (!string.IsNullOrWhiteSpace(extraction.Text))
        {
            // Metadata is taken over before the text so that a failing cast
            // leaves both results at their empty defaults.
            extractedMetadata = (Dictionary<string, string>)extraction.Metadata;
            extractedText = extraction.Text;
        }
    }
    catch (Exception ex)
    {
        onError(ex);
    }

    MetaData = extractedMetadata;
    return extractedText;
}
/// <summary>
/// Extracts the body of the report and stores it under <c>KnownReportParts.PART_BODY</c>.
/// </summary>
/// <remarks>
/// For CNRs ending with ".AMR", ".AAR" or ".SDDP" the complete text is used. Otherwise
/// group 1 of <c>_regex</c>, then group 1 of <c>_regex2</c>, is tried; when neither
/// matches, the complete text is stored after <c>Cleanup</c>.
/// </remarks>
/// <param name="report">Report parts collected so far; must already contain the CNR.</param>
/// <param name="rawInputBody">Extraction result whose <c>Text</c> supplies the body.</param>
/// <exception cref="PartNotFoundException">
/// Neither pattern matched and the text is null, empty or whitespace.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    var cnrUpper = report[KnownReportParts.PART_CNR].ToUpper();
    var usesWholeText = cnrUpper.EndsWith(".AMR")
        || cnrUpper.EndsWith(".AAR")
        || cnrUpper.EndsWith(".SDDP");

    if (usesWholeText)
    {
        report.Add(KnownReportParts.PART_BODY, RemoveInBetweenWhiteSpaces(rawInputBody.Text));
        return;
    }

    var trimChars = "\r\n\t ".ToCharArray();

    var primary = _regex.Match(rawInputBody.Text);
    if (primary.Success)
    {
        var body = primary.Groups[1].Value.Trim(trimChars);
        report.Add(KnownReportParts.PART_BODY, RemoveInBetweenWhiteSpaces(body));
        return;
    }

    var secondary = _regex2.Match(rawInputBody.Text);
    if (secondary.Success)
    {
        var body = secondary.Groups[1].Value.Trim(trimChars);
        report.Add(KnownReportParts.PART_BODY, RemoveInBetweenWhiteSpaces(body));
        return;
    }

    if (string.IsNullOrWhiteSpace(rawInputBody.Text))
    {
        throw new PartNotFoundException("Report body not found");
    }

    // Last resort: no pattern matched, keep the whole (cleaned-up) text.
    report.Add(KnownReportParts.PART_BODY, Cleanup(rawInputBody.Text));
}
/// <summary>
/// Derives the report type from the CNR and stores it under
/// <c>KnownReportParts.REPORTTYPE</c>.
/// </summary>
/// <remarks>
/// Group 1 of <c>_regex</c> applied to the CNR, trimmed and upper-cased, is used as
/// the key into <c>_reportTypes</c>.
/// </remarks>
/// <param name="report">Report parts collected so far; must already contain the CNR.</param>
/// <param name="rawInputBody">Extraction result (not used by this builder).</param>
/// <exception cref="PartNotFoundException">
/// The CNR is missing, or the pattern did not match and this part is required.
/// </exception>
public void Build(Dictionary<string, string> report, TextExtractionResult rawInputBody)
{
    if (!report.ContainsKey(KnownReportParts.PART_CNR))
    {
        throw new PartNotFoundException("CNR must exist to determine report type");
    }

    var typeMatch = _regex.Match(report[KnownReportParts.PART_CNR]);
    if (!typeMatch.Success)
    {
        if (_required)
        {
            throw new PartNotFoundException("Report Type not found.");
        }

        return;
    }

    var typeKey = typeMatch.Groups[1].Value.Trim().ToUpper();
    var typeName = _reportTypes[typeKey];
    report.Add(KnownReportParts.REPORTTYPE, RemoveRedundantSpaces(typeName));
}