/// <summary>
/// Scratch routine for experimenting with Jieba segmentation and part-of-speech tagging.
/// </summary>
/// <remarks>
/// Tags a sample company name with a segmenter that has two extra words added, runs the name through
/// the text-normalisation and bracket-trimming helpers, and finally prints "word:flag" for every
/// token a default POS segmenter produces for a second test string.
/// </remarks>
public static void RunWordAnlayze()
{
    var companyName = "华陆工程(科技)有限责任公司";

    // Add the extra words to the segmenter before wrapping it in the POS tagger.
    var jieba = new JiebaSegmenter();
    jieba.AddWord("华陆工程科技有限责任公司");
    jieba.AddWord("中煤陕西榆林能源化工有限公司");
    var customTagger = new PosSegmenter(jieba);

    // The tagged result is not consumed afterwards.
    var taggedCompanyName = customTagger.Cut(companyName);

    // Normalise first, then strip brackets.
    companyName = RegularTool.TrimBrackets(companyName.NormalizeTextResult());

    // Disabled experiment: surround-word statistics for project names over four training documents.
    /*
    var SProjectName = new Surround();
    var root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1044779.html");
    var Contract = TraningDataset.GetContractById("1044779")[0];
    SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

    root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1450.html");
    Contract = TraningDataset.GetContractById("1450")[0];
    SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

    root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1042224.html");
    Contract = TraningDataset.GetContractById("1042224")[0];
    SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

    root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\917362.html");
    Contract = TraningDataset.GetContractById("917362")[0];
    SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
    SProjectName.WriteTop(10);
    */

    // Print every token of the probe string together with its POS flag.
    var probeText = "承运市";
    var defaultTagger = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
    var taggedTokens = defaultTagger.Cut(probeText);
    foreach (var token in taggedTokens)
    {
        Console.WriteLine(string.Concat(token.Word, ":", token.Flag));
    }
}
/// <summary>
/// Extracts one contract record from the current announcement: party B, party A, project name,
/// contract name, contract amount and consortium member, each cleaned up after extraction.
/// </summary>
/// <remarks>
/// As a side effect <c>contractType</c> is set to "中标" or "合同" according to the first child node
/// whose content contains one of those keywords, or left empty (and reported on the console) when
/// neither occurs. Fixed punctuation markers are matched with ordinal comparison so the
/// <c>StartsWith</c>/<c>EndsWith</c> checks agree with the character-based trimming that follows.
/// </remarks>
/// <returns>The populated <c>ContractRec</c>.</returns>
ContractRec ExtractSingle()
{
    // Classify the announcement: the first node mentioning "中标" (bid award) or "合同" (contract) wins,
    // with "中标" taking precedence inside a single node.
    contractType = String.Empty;
    foreach (var paragrah in root.Children)
    {
        foreach (var item in paragrah.Children)
        {
            if (item.Content.Contains("中标"))
            {
                contractType = "中标";
                break;
            }
            if (item.Content.Contains("合同"))
            {
                contractType = "合同";
                break;
            }
        }
        if (contractType != String.Empty)
        {
            break;
        }
    }
    if (contractType == String.Empty)
    {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new ContractRec();
    // Announcement id
    contract.Id = Id;

    // Party B
    var yiFang = GetYiFang();
    if (yiFang.Contains("本公司"))
    {
        yiFang = string.Empty;
    }
    yiFang = CompanyNameLogic.AfterProcessFullName(yiFang).secFullName;
    yiFang = yiFang.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    yiFang = RegularTool.TrimBrackets(yiFang);
    if (yiFang.Length < 3)
    {
        // Anything shorter than three characters is discarded.
        yiFang = string.Empty;
    }
    contract.YiFang = yiFang;

    // Party A (the extractor is given party B's final value)
    var jiaFang = GetJiaFang(contract.YiFang);
    if (jiaFang.Contains("本公司"))
    {
        jiaFang = string.Empty;
    }
    jiaFang = CompanyNameLogic.AfterProcessFullName(jiaFang).secFullName;
    jiaFang = jiaFang.NormalizeTextResult();
    if (jiaFang.Contains("简称"))
    {
        // Keep only the text in front of the bracket that introduces the abbreviation.
        jiaFang = Utility.GetStringBefore(jiaFang, "(");
    }
    // Organisation list
    if (Nerlist != null)
    {
        // Materialise once; the deferred query would otherwise be re-evaluated for every Contains call.
        var organisations = Nerlist
            .Where(n => n.Type == LTPTrainingNER.enmNerType.Ni)
            .Select(n => n.RawData)
            .ToList();
        if (!organisations.Contains(jiaFang) && organisations.Contains("国家电网公司"))
        {
            jiaFang = "国家电网公司";
        }
    }
    contract.JiaFang = jiaFang;

    // Project name
    var projectName = GetProjectName();
    projectName = projectName.NormalizeTextResult();
    if (projectName.StartsWith("“", StringComparison.Ordinal) && projectName.EndsWith("”", StringComparison.Ordinal))
    {
        projectName = projectName.TrimStart('“').TrimEnd('”');
    }
    if (projectName.EndsWith(",签约双方", StringComparison.Ordinal))
    {
        // NOTE(review): the marker is at the very end, so taking the text AFTER it presumably leaves
        // nothing; GetStringBefore may have been intended - confirm against Utility.GetStringAfter.
        projectName = Utility.GetStringAfter(projectName, ",签约双方");
    }
    if (projectName.Contains("(以下简称"))
    {
        // NOTE(review): this keeps whatever GetStringAfter returns for the abbreviation marker,
        // presumably the abbreviation rather than the full name - confirm this is intended.
        projectName = Utility.GetStringAfter(projectName, "(以下简称");
    }
    if (projectName.EndsWith(")", StringComparison.Ordinal))
    {
        // Cut at a trailing bid-number or contract-number bracket.
        if (projectName.Contains("(招标编号"))
        {
            projectName = Utility.GetStringBefore(projectName, "(招标编号");
        }
        if (projectName.Contains("(合同编号"))
        {
            projectName = Utility.GetStringBefore(projectName, "(合同编号");
        }
    }
    projectName = projectName.Replace("的推荐中标", "");
    // Special handling
    projectName = projectName.Replace("<1>", "1、");
    projectName = projectName.Replace("“", "");
    projectName = projectName.Replace("”", "");
    contract.ProjectName = projectName;

    // Contract name
    var contractName = GetContractName();
    if (contractName.StartsWith("“", StringComparison.Ordinal) && contractName.EndsWith("”", StringComparison.Ordinal))
    {
        contractName = contractName.TrimStart('“').TrimEnd('”');
    }
    // Remove book-title marks.
    contractName = contractName.Replace("《", String.Empty).Replace("》", String.Empty);
    contractName = contractName.NormalizeTextResult();
    if (contractName.Contains("(以下简称"))
    {
        // NOTE(review): same GetStringAfter question as for the project name above in this method.
        contractName = Utility.GetStringAfter(contractName, "(以下简称");
    }
    contractName = ExtendContractName(contractName);
    contract.ContractName = contractName;

    // A purchase agreement carries no project name, unless the project name mentions a bid section.
    if (contract.ContractName.Contains("采购") && !contract.ProjectName.Contains("标段"))
    {
        contract.ProjectName = string.Empty;
    }

    // Amount: the upper and lower limits receive the same formatted value.
    var money = GetMoney();
    contract.ContractMoneyUpLimit = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium member
    contract.UnionMember = GetUnionMember(contract);
    contract.UnionMember = contract.UnionMember.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    contract.UnionMember = RegularTool.TrimBrackets(contract.UnionMember);

    // When party B holds several names, the first stays party B and the remainder
    // overrides the consortium member extracted above.
    var YiFangArray = contract.YiFang.Split(Utility.SplitChar);
    if (YiFangArray.Length > 1)
    {
        contract.UnionMember = Utility.GetStringAfter(contract.YiFang, Utility.SplitChar);
        contract.YiFang = YiFangArray[0];
        Console.WriteLine("联合体:" + contract.UnionMember);
    }
    return contract;
}
/// <summary>
/// Extracts one contract record from an announcement: party A, party B, project name,
/// contract name, contract amount and consortium member, each cleaned up after extraction.
/// </summary>
/// <remarks>
/// As a side effect <c>contractType</c> is set to "中标" or "合同" according to the first child node
/// whose content contains one of those keywords, or left empty (and reported on the console) when
/// neither occurs. Fixed punctuation markers are matched with ordinal comparison so the
/// <c>StartsWith</c>/<c>EndsWith</c> checks agree with the character-based trimming that follows.
/// </remarks>
/// <param name="root">Root node of the parsed announcement; its children's children are scanned for the type keywords.</param>
/// <param name="Id">Announcement id, stored on the result and used in the diagnostic message.</param>
/// <returns>The populated <c>struContract</c>.</returns>
struContract ExtractSingle(MyRootHtmlNode root, String Id)
{
    // Classify the announcement: the first node mentioning "中标" (bid award) or "合同" (contract) wins,
    // with "中标" taking precedence inside a single node.
    contractType = String.Empty;
    foreach (var paragrah in root.Children)
    {
        foreach (var item in paragrah.Children)
        {
            if (item.Content.Contains("中标"))
            {
                contractType = "中标";
                break;
            }
            if (item.Content.Contains("合同"))
            {
                contractType = "合同";
                break;
            }
        }
        if (contractType != String.Empty)
        {
            break;
        }
    }
    if (contractType == String.Empty)
    {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new struContract();
    // Announcement id
    contract.id = Id;

    // Party A
    var jiaFang = GetJiaFang();
    jiaFang = CompanyNameLogic.AfterProcessFullName(jiaFang).secFullName;
    jiaFang = jiaFang.NormalizeTextResult();
    // State Grid Corporation is a special case: it is generally party A, so it is used whenever the
    // extracted name is not a recognised entity but State Grid is. A missing entity list skips the
    // fallback instead of throwing.
    if (Nerlist != null && !Nerlist.Contains(jiaFang) && Nerlist.Contains("国家电网公司"))
    {
        jiaFang = "国家电网公司";
    }
    contract.JiaFang = jiaFang;

    // Party B
    var yiFang = GetYiFang();
    yiFang = CompanyNameLogic.AfterProcessFullName(yiFang).secFullName;
    yiFang = yiFang.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    yiFang = RegularTool.TrimBrackets(yiFang);
    contract.YiFang = yiFang;

    // Project name
    var projectName = GetProjectName();
    if (projectName.StartsWith("“", StringComparison.Ordinal) && projectName.EndsWith("”", StringComparison.Ordinal))
    {
        projectName = projectName.TrimStart('“').TrimEnd('”');
    }
    if (projectName.EndsWith(",签约双方", StringComparison.Ordinal))
    {
        // NOTE(review): the marker is at the very end, so taking the text AFTER it presumably leaves
        // nothing; GetStringBefore may have been intended - confirm against Utility.GetStringAfter.
        projectName = Utility.GetStringAfter(projectName, ",签约双方");
    }
    if (projectName.Contains("(以下简称"))
    {
        // NOTE(review): this keeps whatever GetStringAfter returns for the abbreviation marker,
        // presumably the abbreviation rather than the full name - confirm this is intended.
        projectName = Utility.GetStringAfter(projectName, "(以下简称");
    }
    contract.ProjectName = projectName.NormalizeTextResult();

    // Contract name
    if (contractType == "中标")
    {
        // Per the data analysis, the project name is filled in for bid awards
        // and the contract name for contracts.
        contract.ContractName = String.Empty;
    }
    else
    {
        var contractName = GetContractName();
        if (contractName.StartsWith("“", StringComparison.Ordinal) && contractName.EndsWith("”", StringComparison.Ordinal))
        {
            contractName = contractName.TrimStart('“').TrimEnd('”');
        }
        // Remove book-title marks.
        contractName = contractName.Replace("《", String.Empty).Replace("》", String.Empty);
        if (contractName.Contains("(以下简称"))
        {
            // NOTE(review): same GetStringAfter question as for the project name above in this method.
            contractName = Utility.GetStringAfter(contractName, "(以下简称");
        }
        contract.ContractName = contractName.NormalizeTextResult();
    }

    // Amount: the upper and lower limits receive the same formatted value.
    var money = GetMoney();
    contract.ContractMoneyUpLimit = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium member
    var unionMember = GetUnionMember(contract.JiaFang, contract.YiFang);
    unionMember = unionMember.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    contract.UnionMember = RegularTool.TrimBrackets(unionMember);

    return contract;
}