/// <summary>
/// Runs a keyword search against the index and returns the matching standards.
/// </summary>
/// <param name="keywords">Search text; converted to a query by <c>GetQuery</c>.</param>
/// <param name="progress">Not used by this method.</param>
/// <returns>
/// Up to 1000 hits, sorted with whatever <c>GetSort</c> returns. Each item carries the
/// stored StandardName, FileName, Mkb and OrderNum fields; Text is not populated here.
/// </returns>
public IEnumerable<MedStandardInfo> Find(string keywords, IProgress<string> progress)
{
    const int maxHits = 1000;

    using (var indexDirectory = GetDirectory())
    using (var indexSearcher = new IndexSearcher(indexDirectory))
    {
        var hits = indexSearcher.Search(GetQuery(keywords), null, maxHits, GetSort());

        // Materialize before leaving the using scope: the searcher is disposed on return.
        return hits.ScoreDocs
            .Select(hit => indexSearcher.Doc(hit.Doc))
            .Select(stored => new MedStandardInfo
            {
                StandardName = stored.Get("StandardName"),
                FileName = stored.Get("FileName"),
                Mkb = stored.Get("Mkb"),
                OrderNum = stored.Get("OrderNum"),
            })
            .ToList();
    }
}
/// <summary>
/// Extracts the text of a PDF file and parses the order number, the standard name and
/// the diagnosis codes (the "код по мкб" / "нозолог" section) out of it.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <param name="cancellationToken">
/// Cancels the background text extraction; observed before the work starts and before each page.
/// </param>
/// <param name="progress">Optional sink for status messages; may be null.</param>
/// <returns>
/// A <see cref="MedStandardInfo"/> whose FileName and Text are always set. OrderNum,
/// StandardName and Mkb are set only when the corresponding text is found. When codes are
/// found, Text is replaced by the lower-cased, trimmed lines that precede the diagnosis
/// section, joined with single spaces.
/// </returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled during extraction.</exception>
public async Task<MedStandardInfo> ReadPdfAsync(string fileName, CancellationToken cancellationToken, IProgress<string> progress = null)
{
    if (!File.Exists(fileName))
    {
        throw new FileNotFoundException($@"Файл {fileName} не найден", fileName);
    }

    progress = progress ?? new Progress<string>();
    progress.Report($@"Чтение текста файла {System.IO.Path.GetFileName(fileName)} ...");

    var text = new StringBuilder();
    await Task.Run(() =>
    {
        using (PdfReader reader = new PdfReader(fileName))
        {
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                // Task.Run only checks the token before starting the delegate, so honor
                // cancellation between pages as well.
                cancellationToken.ThrowIfCancellationRequested();
                text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
            }
        }
    }, cancellationToken);

    var result = new MedStandardInfo
    {
        FileName = System.IO.Path.GetFileName(fileName),
        Text = text.ToString(),
    };

    // All further matching is done on lower-cased, trimmed lines.
    var lines = result.Text.ToLower().Split(new[] { '\n' }).Select(x => x.Trim()).ToList();

    // Order number: first line containing "№ <digits>" with an optional trailing "н".
    var regNum = new Regex(@"№\s*\d+\s*(н)?");
    var ordNumLineIndex = lines.FindIndex(x => regNum.IsMatch(x));
    if (ordNumLineIndex >= 0)
    {
        var num = regNum.Match(lines[ordNumLineIndex]).Value;
        result.OrderNum = num.Replace(@"№", "").Trim();
    }

    // The standard name is looked for only after the registration stamp line.
    var lineIndex = lines.FindIndex(x => x.Contains(@"зарегистрировано"));
    if (lineIndex < 0)
    {
        return result;
    }

    lineIndex = lines.FindIndex(lineIndex, x => x.Contains(@"стандарт"));
    if (lineIndex < 0)
    {
        progress.Report(@"Не найдено слово СТАНДАРТ");
        return result;
    }

    // Accumulate the name until a line containing ":" or "мероприят" is reached.
    // The bounds check prevents running past the last line when no terminator follows.
    do
    {
        result.StandardName += " " + lines[lineIndex++];
    }
    while (lineIndex < lines.Count
           && !lines[lineIndex].Contains(":")
           && !lines[lineIndex].Contains("мероприят"));

    // Empty source lines produce doubled spaces in the joined name; collapse them.
    result.StandardName = result.StandardName.Trim().Replace("  ", " ");

    // The diagnosis section is searched from the top of the document.
    lineIndex = lines.FindIndex(0, x => x.Contains("код по мкб") || x.Contains("нозолог"));
    if (lineIndex < 0)
    {
        progress.Report(@"Не найдена секция диагнозов");
        return result;
    }

    // NOTE(review): the comma inside the character class is a literal, so ",12" also
    // matches; presumably unintended, but left unchanged here - confirm before altering.
    var reg = new Regex(@"[a-z,а-я]\d{1,2}(\.\d{1,2})?");
    for (int i = lineIndex; i < lines.Count; i++)
    {
        foreach (var match in reg.Matches(lines[i]).OfType<Match>())
        {
            result.Mkb += " " + match.Value;
        }

        // The code list ends at the line mentioning "мероприят" or "услуги" (inclusive).
        if (lines[i].Contains("мероприят") || lines[i].Contains("услуги"))
        {
            break;
        }
    }

    if (!string.IsNullOrEmpty(result.Mkb))
    {
        result.Mkb = Translit(result.Mkb.Trim().Replace("  ", " ").ToUpper());
        // Keep only the text that precedes the diagnosis section.
        result.Text = string.Join(" ", lines.GetRange(0, lineIndex));
    }

    return result;
}
/// <summary>
/// Builds the Lucene <see cref="Document"/> that is stored in the index for a standard.
/// </summary>
/// <param name="medStandard">The parsed standard to index.</param>
/// <returns>
/// A document with stored fields StandardName, Text and Mkb (analyzed) and OrderNum and
/// FileName (not analyzed, no norms).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="medStandard"/> is null.</exception>
private Document MapMedStandard(MedStandardInfo medStandard)
{
    if (medStandard == null)
    {
        throw new ArgumentNullException(nameof(medStandard));
    }

    var document = new Document();

    // Lucene's Field constructor rejects null values, and parsing can leave
    // StandardName, Mkb or OrderNum unset, so missing values are indexed as empty strings.
    document.Add(new Field("StandardName", medStandard.StandardName ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
    document.Add(new Field("Text", medStandard.Text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
    document.Add(new Field("Mkb", medStandard.Mkb ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
    document.Add(new Field("OrderNum", medStandard.OrderNum ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    document.Add(new Field("FileName", medStandard.FileName ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

    return document;
}