/// <summary>
/// Extracts the examination findings ("检查所见") and the diagnostic impression
/// ("诊断意见") from a report table cell and stores them on the scan entity.
/// </summary>
/// <param name="td">the table cell element that holds the report text</param>
/// <param name="se">the scan entity the extracted text is written to</param>
private void crawlScanAndSuggestion(mshtml.IHTMLElement td, ScanEntity se)
{
    string content = td.innerText;
    if (content != null && !"".Equals(content.Trim()))
    {
        //log.WriteLog("报告原文:【" + content + "】");
        if (content.Contains("检查所见"))
        {
            // Findings: everything after the "检查所见:" header line.
            se.YXBXHJCSJ = EpisodeRegexUtils.getFirstMatchedFromString(content, @"^\s*检查所见:\s*\n+(.+)$", true);
        }
        else if (content.Contains("诊断意见"))
        {
            // Impression: everything after the "诊断意见:" header line.
            se.JCZDHTS = EpisodeRegexUtils.getFirstMatchedFromString(content, @"^\s*诊断意见:\s*\n+(.+)$", true);
        }
    }
}
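// EpisodeRegexUtils.getFirstMatchedFromString is referenced here and below but
// not defined in this listing. Judging from the call sites, it returns the
// first capture group of the first match, with an optional flag for multiline
// mode. A minimal sketch under that assumption (not the original utility):
using System.Text.RegularExpressions;

public static partial class EpisodeRegexUtils
{
    // Returns group 1 of the first match, or null when the pattern does not match.
    public static string getFirstMatchedFromString(string input, string pattern, bool multiline = false)
    {
        RegexOptions options = multiline ? RegexOptions.Multiline : RegexOptions.None;
        Match m = Regex.Match(input, pattern, options);
        return m.Success ? m.Groups[1].Value : null;
    }
}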
public Dictionary<string, string> ExtractTextFromPdf()
{
    PdfReader pdfReader = new PdfReader(new Uri(url));
    int numberOfPages = pdfReader.NumberOfPages;
    Dictionary<string, string> scanItem = new Dictionary<string, string>();
    string page;
    for (int i = 1; i <= numberOfPages; ++i)
    {
        // Use a fresh strategy per page: SimpleTextExtractionStrategy keeps an
        // internal buffer, so reusing one instance would make each page's text
        // include all previously extracted pages.
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        page = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
        string[] linesOfPage = page.Split('\n');
        foreach (string line in linesOfPage)
        {
            //log.WriteLog("line=" + line);
            try
            {
                // "检查时间" = examination time, with both date and time of day.
                if (line.Contains("检查时间"))
                {
                    string dateString = EpisodeRegexUtils.getFirstMatchedFromString(line, @"^检查时间:(.+)$");
                    DateTime dt = DateTime.ParseExact(dateString, "yyyy年MM月dd日 HH:mm:ss", System.Globalization.CultureInfo.CurrentCulture);
                    scanItem["JCSJ"] = dt.ToString("yyyy-MM-dd HH:mm:ss");
                }
                // "报告时间" = report time, date only.
                if (line.Contains("报告时间"))
                {
                    string dateString = EpisodeRegexUtils.getFirstMatchedFromString(line, @"^报告时间:(.+)$");
                    DateTime dt = DateTime.ParseExact(dateString, "yyyy年MM月dd日", System.Globalization.CultureInfo.CurrentCulture);
                    scanItem["BGSJ"] = dt.ToString("yyyy-MM-dd");
                }
            }
            catch (Exception e)
            {
                // Log the offending line and keep going; one unparsable line
                // must not abort extraction of the whole document.
                log.WriteLog("【" + line + "】解析失败:" + e.ToString() + e.StackTrace);
            }
        }
    }
    pdfReader.Close();
    return scanItem;
}
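// A minimal, assumed usage sketch (PdfCrawlerDemo and PrintPdfTimes are
// hypothetical names, not from the original source): it only shows how a
// caller might read the "JCSJ"/"BGSJ" keys that ExtractTextFromPdf fills in.
using System;
using System.Collections.Generic;

public static class PdfCrawlerDemo
{
    public static void PrintPdfTimes(PdfCrawler crawler)
    {
        Dictionary<string, string> scanItem = crawler.ExtractTextFromPdf();
        string value;
        if (scanItem.TryGetValue("JCSJ", out value))
        {
            Console.WriteLine("examination time JCSJ = " + value); // "yyyy-MM-dd HH:mm:ss"
        }
        if (scanItem.TryGetValue("BGSJ", out value))
        {
            Console.WriteLine("report time BGSJ = " + value); // "yyyy-MM-dd"
        }
    }
}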
/// <summary>
/// Returns the crawler instance that matches the given URL, or null when no
/// crawler is registered for it.
/// </summary>
public static Crawler getCrawler(string url)
{
    // Ultrasound report crawler
    if (EpisodeRegexUtils.matchUrl(url, @"RisWeb3/ReportContent[.]aspx(.+?)LOC=549[&]STYLE=RIS3[-]4$"))
    {
        UltrasonicCrawler crawler = new UltrasonicCrawler();
        // The examination number (检查号) is carried in the SID query parameter.
        string jch = EpisodeRegexUtils.getFirstMatchedFromString(url, @"SID=(.+?)[&]");
        crawler.JCH = jch;
        log.WriteLog("完成检查超声报告内容爬虫构造,开始爬取检查超声报告内容。检查号JCH='" + jch + "', 患者就诊信息Episode==null is " + (Crawler.episode == null));
        return crawler;
    }
    // PDF report crawler
    else if (url.ToUpper().EndsWith(".PDF"))
    {
        PdfCrawler crawler = new PdfCrawler();
        int dotIdx = url.LastIndexOf(".");
        int slashIdx = url.LastIndexOf("/");
        // The file name is the segment between the last '/' and the last '.'.
        string fileName = url.Substring(slashIdx + 1, dotIdx - slashIdx - 1);
        crawler.fileName = fileName;
        crawler.url = url;
        log.WriteLog("完成PDF报告爬虫构造,开始爬取PDF报告内容。fileName=" + fileName);
        return crawler;
    }
    // Examination list crawler
    else if (EpisodeRegexUtils.matchUrl(url, @"epr[.]chart[.]csp[?]PatientID=(\d+?)[&]EpisodeID=(\d+?)[&]EpisodeIDs=[&]mradm=(\d+?)[&]ChartID=23"))
    {
        ScanListCrawler crawler = new ScanListCrawler();
        Crawler.episode = EpisodeRegexUtils.getEpisodeFromUrl(url);
        log.WriteLog("完成检查列表爬虫构造,开始爬取检查列表内容。PatientID=" + Crawler.episode.PatientID + ", EpisodeID=" + Crawler.episode.EpisodeID);
        return crawler;
    }
    //return new DefaultCrawler();
    return null;
}
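// EpisodeRegexUtils.matchUrl is likewise not shown in this listing; from the
// call sites it is a boolean "does this URL match the pattern" check. A
// minimal sketch under that assumption:
using System.Text.RegularExpressions;

public static partial class EpisodeRegexUtils
{
    // True when any part of the URL matches the given regular expression.
    public static bool matchUrl(string url, string pattern)
    {
        return Regex.IsMatch(url, pattern);
    }
}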