Example #1
        /// <summary>
        /// Extract the examination and suggestion content from a table cell
        /// </summary>
        /// <param name="td">the HTML table cell to read</param>
        /// <param name="se">the scan entity that receives the extracted fields</param>
        private void crawlScanAndSuggestion(mshtml.IHTMLElement td, ScanEntity se)
        {
            string content = td.innerText;

            if (!string.IsNullOrWhiteSpace(content))
            {
                //log.WriteLog("Report text: [" + content + "]");
                if (content.Contains("检查所见")) // "examination findings" section
                {
                    // Capture the findings text that follows the "检查所见:" label
                    se.YXBXHJCSJ = EpisodeRegexUtils.getFirstMatchedFromString(content, @"^\s*检查所见:\s*\n+(.+)$", true);
                }
                else if (content.Contains("诊断意见")) // "diagnostic opinion" section
                {
                    // Capture the diagnosis text that follows the "诊断意见:" label
                    se.JCZDHTS = EpisodeRegexUtils.getFirstMatchedFromString(content, @"^\s*诊断意见:\s*\n+(.+)$", true);
                }
            }
        }
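
Both branches above delegate to EpisodeRegexUtils.getFirstMatchedFromString, whose implementation is not shown on this page. A minimal sketch of what it plausibly does, assuming it wraps System.Text.RegularExpressions.Regex, returns the first capture group (or null), and that the optional flag enables RegexOptions.Singleline so that "(.+)" can span line breaks:

        using System.Text.RegularExpressions;

        // Sketch only: the real EpisodeRegexUtils is not included in these examples.
        public static class EpisodeRegexUtils
        {
            // Returns the first capture group of the first match, or null when the
            // pattern does not match. The optional flag is assumed to enable
            // RegexOptions.Singleline so that "." also matches newlines.
            public static string getFirstMatchedFromString(string input, string pattern, bool singleLine = false)
            {
                RegexOptions options = singleLine ? RegexOptions.Singleline : RegexOptions.None;
                Match m = Regex.Match(input, pattern, options);
                return m.Success && m.Groups.Count > 1 ? m.Groups[1].Value : null;
            }

            // Also used by the getCrawler factory below: true when the pattern
            // matches anywhere in the URL.
            public static bool matchUrl(string url, string pattern)
            {
                return Regex.IsMatch(url, pattern);
            }
        }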
Example #2
        public Dictionary<string, string> ExtractTextFromPdf()
        {
            PdfReader pdfReader = new PdfReader(new Uri(url));
            int numberOfPages = pdfReader.NumberOfPages;
            Dictionary<string, string> scanItem = new Dictionary<string, string>();

            for (int i = 1; i <= numberOfPages; ++i)
            {
                // One strategy per page: SimpleTextExtractionStrategy keeps internal
                // state, so reusing a single instance across GetTextFromPage calls
                // would make each page's text include all previous pages.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string page = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
                string[] linesOfPage = page.Split('\n');
                foreach (string line in linesOfPage)
                {
                    //log.WriteLog("line=" + line);
                    try
                    {
                        if (line.Contains("检查时间")) // examination time line
                        {
                            string dateString = EpisodeRegexUtils.getFirstMatchedFromString(line, @"^检查时间:(.+)$");
                            // InvariantCulture: CurrentCulture may use a non-Gregorian calendar
                            DateTime dt = DateTime.ParseExact(dateString, "yyyy年MM月dd日 HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
                            scanItem["JCSJ"] = dt.ToString("yyyy-MM-dd HH:mm:ss"); // indexer: Add would throw if the key repeats on a later page
                        }
                        if (line.Contains("报告时间")) // report time line
                        {
                            string dateString = EpisodeRegexUtils.getFirstMatchedFromString(line, @"^报告时间:(.+)$");
                            DateTime dt = DateTime.ParseExact(dateString, "yyyy年MM月dd日", System.Globalization.CultureInfo.InvariantCulture);
                            scanItem["BGSJ"] = dt.ToString("yyyy-MM-dd");
                        }
                    }
                    catch (Exception e)
                    {
                        log.WriteLog("Failed to parse [" + line + "]: " + e); // e.ToString() already includes the stack trace
                    }
                }
            }
            pdfReader.Close();
            return scanItem;
        }
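
A hypothetical call site for ExtractTextFromPdf: PdfCrawler and its url field appear in the factory method below, but the URL value here is only a placeholder:

        // Hypothetical usage: pull the examination time (JCSJ) and the report
        // time (BGSJ) out of a PDF report; the URL is a placeholder.
        PdfCrawler crawler = new PdfCrawler();
        crawler.url = "http://his.example.org/reports/sample.pdf";

        Dictionary<string, string> fields = crawler.ExtractTextFromPdf();
        string jcsj;
        if (fields.TryGetValue("JCSJ", out jcsj))
        {
            Console.WriteLine("Examination time: " + jcsj); // formatted "yyyy-MM-dd HH:mm:ss"
        }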
Example #3
 /// <summary>
 /// Get the crawler instance that matches the visited URL
 /// </summary>
 public static Crawler getCrawler(string url)
 {
     // Ultrasound report crawler
     if (EpisodeRegexUtils.matchUrl(url, @"RisWeb3/ReportContent[.]aspx(.+?)LOC=549[&]STYLE=RIS3[-]4$"))
     {
         UltrasonicCrawler crawler = new UltrasonicCrawler();
         string            jch     = EpisodeRegexUtils.getFirstMatchedFromString(url, @"SID=(.+?)[&]");
         crawler.JCH = jch;
         log.WriteLog("Ultrasound report crawler constructed; starting to crawl the ultrasound report content. Examination number JCH='" + jch
                      + "', patient episode Episode==null is " + (Crawler.episode == null));
         return crawler;
     }
     // PDF report crawler
     else if (url.ToUpper().EndsWith(".PDF"))
     {
         PdfCrawler crawler  = new PdfCrawler();
         int        dotIdx   = url.LastIndexOf(".");
         int        slashIdx = url.LastIndexOf("/");
         string     fileName = url.Substring(slashIdx + 1, (dotIdx - slashIdx - 1));
         crawler.fileName = fileName;
         crawler.url      = url;
         log.WriteLog("PDF report crawler constructed; starting to crawl the PDF report content. fileName=" + fileName);
         return crawler;
     }
     // Examination list crawler
     else if (EpisodeRegexUtils.matchUrl(url, @"epr[.]chart[.]csp[?]PatientID=(\d+?)[&]EpisodeID=(\d+?)[&]EpisodeIDs=[&]mradm=(\d+?)[&]ChartID=23"))
     {
         ScanListCrawler crawler = new ScanListCrawler();
         Crawler.episode = EpisodeRegexUtils.getEpisodeFromUrl(url);
         log.WriteLog("Examination list crawler constructed; starting to crawl the examination list. PatientID=" + Crawler.episode.PatientID
                      + ", EpisodeID=" + Crawler.episode.EpisodeID);
         return crawler;
     }
     // No crawler matches this URL
     //return new DefaultCrawler();
     return null;
 }
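
A hypothetical dispatch loop built on this factory. The containing class name (CrawlerFactory here) and the crawl() entry point on Crawler are assumptions; only the factory method itself is shown above:

 // Hypothetical dispatch: pick a crawler for each visited URL and run it.
 // CrawlerFactory and crawl() are assumed names, not part of the snippets above.
 string[] visitedUrls =
 {
     "http://his.example.org/reports/sample.pdf", // handled by PdfCrawler
     "http://his.example.org/unrelated/page.html" // no crawler matches
 };

 foreach (string visited in visitedUrls)
 {
     Crawler crawler = CrawlerFactory.getCrawler(visited);
     if (crawler == null)
     {
         continue; // not a recognized report page
     }
     crawler.crawl();
 }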