/// <summary>
/// Runs every configured extraction strategy against <paramref name="doc"/> and stores the
/// accepted candidates in the matching candidate collections of this instance.
/// </summary>
/// <remarks>
/// Stages, in order:
/// 1. Keyword map: when <c>KeyWordMap</c> is not empty, only <c>ExtractByKeyWordMap</c> runs.
///    A single hit is stored in <c>WordMapResult</c>; multiple hits are only logged. The method
///    then returns without running any other stage.
/// 2. Leading-colon keywords: values following a fixed leading word, read from the text file and
///    from the HTML tree. These values skip <c>CheckCandidate</c> and only go through the optional
///    <c>LeadingColonKeyWordCandidatePreprocess</c> delegate.
/// 3. Quoted names: entries of <c>doc.quotationList</c> ending with a configured trailing word.
/// 4. Dependency parsing: candidates found by <c>ExtractPropertyByDP.StartWithKey</c>.
/// 5. Start/end string features: read from the text file and from the HTML tree.
/// Diagnostic logging is emitted only when <c>Program.IsMultiThreadMode</c> is false.
/// </remarks>
/// <param name="doc">The announcement document supplying the HTML root, text file name,
/// quotation list and dependency-parse list used by the stages above.</param>
public void Extract(AnnouceDocument doc)
{
    // Pure keyword type: this stage is exclusive, nothing else runs afterwards.
    if (KeyWordMap.Count != 0)
    {
        var candidate = ExtractByKeyWordMap(doc.root);
        if (candidate.Count == 1)
        {
            WordMapResult = candidate.First();
        }
        if (candidate.Count > 1)
        {
            // Ambiguous result: it is reported but WordMapResult is left untouched.
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("找到纯关键字类型两个关键字");
            }
        }
        return;
    }

    if (LeadingColonKeyWordList != null)
    {
        // Rule-based values introduced by a fixed leading word, e.g. [项目名:].
        // These values are not subject to the other constraints (such as min/max length);
        // they have their own dedicated preprocessor instead.
        var ExtractorText = new ExtractPropertyByText();
        // The TEXT version may contain spaces, so the HTML version is checked as well (below).
        ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
        ExtractorText.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            LeadingColonKeyWordCandidate.Add(PropertyValue);
        }

        var Extractor = new ExtractPropertyByHTML();
        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        Extractor.Extract(doc.root);
        // Fix: enumerate the HTML extractor's results. The previous code looped over
        // ExtractorText.CandidateWord again, so the HTML candidates were never considered.
        foreach (var item in Extractor.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            // Values already found in the TEXT version are not added a second time.
            if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
            {
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }
        }
    }

    // Book-title marks and quotation marks.
    if (QuotationTrailingWordList != null)
    {
        // Quoted content (《》 and “”) that ends with one of the configured trailing words.
        foreach (var bracket in doc.quotationList)
        {
            foreach (var word in QuotationTrailingWordList)
            {
                if (bracket.Value.EndsWith(word))
                {
                    var PropertyValue = CheckCandidate(bracket.Value);
                    if (String.IsNullOrEmpty(PropertyValue))
                    {
                        continue;
                    }
                    if (!Program.IsMultiThreadMode)
                    {
                        Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                    }
                    QuotationTrailingCandidate.Add(PropertyValue);
                }
            }
        }
    }

    // Syntactic dependency parsing.
    if (DpKeyWordList != null)
    {
        var ExtractDP = new ExtractPropertyByDP();
        ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var PropertyValue = CheckCandidate(item.Value);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            DpKeyWordCandidate.Add(PropertyValue);
        }
    }

    if (ExternalStartEndStringFeature != null)
    {
        var ExtractorTEXT = new ExtractPropertyByText();
        ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorTEXT.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
        }

        // Covers the cases where the value cannot be extracted from the TEXT version.
        var ExtractorHTML = new ExtractPropertyByHTML();
        ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorHTML.Extract(doc.root);
        foreach (var item in ExtractorHTML.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            // Skip values the TEXT pass already contributed.
            if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
            {
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }
        }
    }
}
/// <summary>
/// Gets the project name by trying several extraction strategies in priority order and
/// returning the first acceptable candidate.
/// </summary>
/// <remarks>
/// Order of strategies:
/// 1. Text following a leading keyword ("项目名称:" etc.) in the text file.
/// 2. The same leading keywords searched in the HTML tree.
/// 3. A quoted entry of <c>quotationList</c> ending with "工程" or "标段".
/// 4. Content between “ and ” matched by the configured mark features.
/// 5. Dependency-parse candidates from <c>ExtractPropertyByDP.StartWithKey</c>.
/// Diagnostic logging is emitted only when <c>Program.IsMultiThreadMode</c> is false.
/// </remarks>
/// <returns>The first accepted candidate, or <c>String.Empty</c> when no strategy yields one.</returns>
string GetProjectName()
{
    var ExtractorText = new ExtractPropertyByText();
    // Text after these keywords has the highest priority.
    ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    ExtractorText.ExtractFromTextFile(TextFileName);
    foreach (var item in ExtractorText.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        // TrimJianCheng is evaluated once; an empty result rejects the candidate.
        var TrimmedName = TrimJianCheng(ProjectName);
        if (TrimmedName == String.Empty)
        {
            continue;
        }
        ProjectName = TrimmedName;
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return (ProjectName);
    }

    var Extractor = new ExtractPropertyByHTML();
    Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
    // Fix: the extractor must actually run before its candidates are read. Previously
    // Extract was not called here, so this loop never saw any HTML keyword candidate.
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        var TrimmedName = TrimJianCheng(ProjectName);
        if (TrimmedName == String.Empty)
        {
            continue;
        }
        ProjectName = TrimmedName;
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return (ProjectName);
    }

    // Quoted names that end like a project or bid-section name are returned as-is.
    foreach (var bracket in quotationList)
    {
        if (bracket.Value.EndsWith("工程") || bracket.Value.EndsWith("标段"))
        {
            return (bracket.Value);
        }
    }

    var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeature.MarkStartWith = "“";
    MarkFeature.MarkEndWith = "”";
    MarkFeature.InnerEndWith = "标段";

    var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeatureConfirm.MarkStartWith = "“";
    MarkFeatureConfirm.MarkEndWith = "”";
    MarkFeatureConfirm.InnerEndWith = "标";

    // A separate extractor is used for this pass so that its candidate list cannot be
    // affected by the keyword pass above. It carries the same configuration the original
    // single extractor had at this point (keyword list plus mark features).
    var MarkExtractor = new ExtractPropertyByHTML();
    MarkExtractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
    MarkExtractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
    MarkExtractor.Extract(root);
    foreach (var item in MarkExtractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            // Fix: log the candidate text (as every other log line here does) rather than
            // the candidate object itself.
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + ProjectName + "]");
        }
        return (ProjectName);
    }

    var ExtractDP = new ExtractPropertyByDP();
    var KeyList = new List<ExtractPropertyByDP.DPKeyWord>();
    KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
    {
        StartWord = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "采购", "项目", "工程", "标段" },
        EndDPValue = new string[] { }
    });
    ExtractDP.StartWithKey(KeyList, Dplist);
    foreach (var item in ExtractDP.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        // NOTE(review): this pass limits by MaxProjectNameLength while the passes above use
        // MaxContractNameLength — confirm which limit is intended for project names.
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
        {
            continue;
        }
        // Candidates of four characters or fewer are rejected.
        if (ProjectName.Length <= 4)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
        }
        return (ProjectName);
    }
    return (String.Empty);
}