示例#1
0
    /// <summary>
    /// Resolves the project (engineering) name of this announcement.
    /// </summary>
    /// <remarks>
    /// Three attempts are made in order, and the first non-empty answer wins:
    /// 1. labelled keywords plus quoted text ending with a known suffix;
    /// 2. the same property re-extracted with an additional start/end string feature;
    /// 3. a quoted title that contains the candidate-publicity marker.
    /// </remarks>
    /// <returns>The project name, or an empty string when no attempt produced one.</returns>
    string GetProjectName()
    {
        var property = new EntityProperty
        {
            PropertyName            = "工程名称",
            LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" },
            LeadingColonKeyWordCandidatePreprocess  = TrimEndJianCheng,
            QuotationTrailingWordList_IsSkipBracket = true,
            QuotationTrailingWordList = new string[] {
                "标段施工项目", "标段土建工程", "标段施工总承包", "标段的工程", "标段工程", "标段施工总价承包",
                "标段施工总承包工程", "标段施工工程", "标段土建工程建设项目", "标段站前工程", "标段工程(施工)",
                "工程施工工程", "项目施工工程", "施工工程",
                "工程项目", "工程标段", "标段的施工项目", "标段项目", "标段施工",
                "招标采购项目", "招标活动", "采购活动", "招标项目",
                "项目", "采购", "总承包",
                "工程", "标段", "标",
            },
        };

        // Attempt 1: keyword / quotation based extraction.
        property.Extract(this);
        var projectName = property.EvaluateCI();
        if (!String.IsNullOrEmpty(projectName))
        {
            return projectName;
        }

        // Attempt 2: add "text between a start phrase and an end phrase" as a feature
        // and run the extraction again on the same property object.
        var startPhrases = new string[] { "公司为", "参与了", "确定为" };
        var endPhrases   = new string[] { "的中标单位", "的公开招投标", "的中标人", "候选人" };
        property.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(startPhrases, endPhrases);
        property.Extract(this);
        projectName = property.EvaluateCI();
        if (!String.IsNullOrEmpty(projectName))
        {
            // Prefer the variant with the "项目" suffix when that exact text occurs in the document.
            var suffixed = projectName + "项目";
            return ExtractPropertyByHTML.FindWordCnt(suffixed, root).Count >= 1 ? suffixed : projectName;
        }

        // Attempt 3: take whatever precedes the publicity marker in the first quoted text containing it.
        const string publicityMarker = "推荐的中标候选人公示";
        foreach (var quoted in quotationList)
        {
            if (!quoted.Value.Contains(publicityMarker))
            {
                continue;
            }
            return Utility.GetStringBefore(quoted.Value, publicityMarker);
        }
        return string.Empty;
    }
示例#2
0
    /// <summary>
    /// Collects the mapped values of every <c>KeyWordMap</c> key that occurs in the document.
    /// </summary>
    /// <param name="root">Root node of the parsed HTML document to search.</param>
    /// <returns>Distinct mapped values, in the order their keys are enumerated.</returns>
    List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
    {
        var matchedValues = new List <string>();

        foreach (var entry in KeyWordMap)
        {
            // Keyword does not occur in the document: nothing to record.
            if (ExtractPropertyByHTML.FindWordCnt(entry.Key, root).Count <= 0)
            {
                continue;
            }
            // Several keywords may map to the same value; keep it once.
            if (matchedValues.Contains(entry.Value))
            {
                continue;
            }
            matchedValues.Add(entry.Value);
        }
        return matchedValues;
    }
示例#3
0
    /// <summary>
    /// Returns the mapped value of each <c>KeyWordMap</c> key found in the document, without duplicates.
    /// </summary>
    /// <param name="root">Root node of the parsed HTML document to search.</param>
    /// <returns>Mapped values of the keys that were found, each listed once.</returns>
    List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
    {
        var hits = new List <string>();

        foreach (var pair in KeyWordMap)
        {
            // Record the value only when its key is present and the value is not already listed.
            if (ExtractPropertyByHTML.HasWord(pair.Key, root) && !hits.Contains(pair.Value))
            {
                hits.Add(pair.Value);
            }
        }
        return hits;
    }
示例#4
0
文件: Contract.cs 项目: lxxwin/FDDC
    /// <summary>
    /// Determines party B (乙方) of the contract.
    /// </summary>
    /// <remarks>
    /// Sources are tried in order: the labelled keyword in the text file, the first
    /// sub-company in <c>companynamelist</c>, text preceding "有限公司董事会" in the HTML,
    /// and finally the announcing company name.
    /// </remarks>
    /// <returns>The name chosen for party B.</returns>
    string GetYiFang()
    {
        // 1) Text that follows the explicit label in the plain-text version.
        //    Labels that are listed in the original source but not enabled:
        //    "供应商名称:","中标单位:","中标人:","乙方(供方):","承包人:","承包方:","中标方:","中标人名称:"
        var textExtractor = new ExtractPropertyByText();
        textExtractor.LeadingColonKeyWordList = new string[] { "乙方:" };
        textExtractor.ExtractFromTextFile(TextFileName);
        foreach (var candidate in textExtractor.CandidateWord)
        {
            // Only the first candidate is ever used.
            var partyB = candidate.Value.Trim();
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("乙方候补词(关键字):[" + partyB + "]");
            }
            return partyB;
        }

        // 2) A sub-company, when one is known, takes precedence over the remaining sources.
        foreach (var company in companynamelist)
        {
            if (!company.isSubCompany)
            {
                continue;
            }
            return company.secFullName;
        }

        // 3) Text preceding the board-of-directors signature in the HTML; candidates are
        //    reversed before iterating, so the last one extracted is examined first.
        var htmlExtractor = new ExtractPropertyByHTML();
        htmlExtractor.TrailingWordList = new string[] { "有限公司董事会" };
        htmlExtractor.Extract(root);
        htmlExtractor.CandidateWord.Reverse();
        foreach (var candidate in htmlExtractor.CandidateWord)
        {
            var partyB = candidate.Value.Trim() + "有限公司";
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("乙方候补词(关键字):[" + partyB + "]");
            }
            return partyB;
        }

        // 4) Fall back to the company that published the announcement.
        return AnnouceCompanyName;
    }
示例#5
0
    /// <summary>
    /// Locates share counts: comma-grouped digit runs configured with the trailing word "股".
    /// </summary>
    /// <param name="root">Root node of the parsed HTML document; its children's children are scanned.</param>
    /// <returns>All matches reported by the regular-expression finder, in scan order.</returns>
    public static List <LocAndValue <String> > LocateStockNumber(HTMLEngine.MyRootHtmlNode root)
    {
        var shareCountFeature = new ExtractProperyBase.struRegularExpressFeature();
        shareCountFeature.RegularExpress   = @"\d+(,\d+)+";
        shareCountFeature.TrailingWordList = new List <string> { "股" };

        var matches = new List <LocAndValue <String> >();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                // Each sentence is searched on its own; the position id ties results back to the sentence.
                matches.AddRange(ExtractPropertyByHTML.RegularExFinder(sentence.PositionId, sentence.Content, shareCountFeature, "|"));
            }
        }
        return matches;
    }
示例#6
0
文件: Contract.cs 项目: lxxwin/FDDC
    /// <summary>
    /// Determines party A (甲方) of the contract.
    /// </summary>
    /// <remarks>
    /// The labelled-keyword pass is trusted most: its first candidate is returned directly.
    /// Otherwise candidates are gathered from two start/end phrase passes over the HTML
    /// (tender wording, then contract wording) and handed to
    /// <c>CompanyNameLogic.MostLikeCompanyName</c> to pick one.
    /// </remarks>
    /// <returns>The name chosen for party A (whatever <c>MostLikeCompanyName</c> yields when only the fallback passes apply).</returns>
    public string GetJiaFang()
    {
        // Highest-confidence extraction: text following an explicit label.
        EntityProperty e = new EntityProperty();

        e.ExcludeContainsWordList = new string[] { "招标代理" };
        e.LeadingColonKeyWordList = new string[] {
            "甲方:", "合同买方:",
            "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
            "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
            "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
        };
        e.CandidatePreprocess = (x =>
        {
            x = Normalizer.ClearTrailing(x);
            return(CompanyNameLogic.AfterProcessFullName(x).secFullName);
        });
        e.MaxLength = ContractTraning.JiaFangES.MaxLength;
        e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
        e.MinLength = 3;
        e.Extract(this);

        // The candidate list itself is intentionally not de-duplicated (a higher frequency
        // means higher confidence). Several *distinct* candidates may mean there is no
        // party A at all, so they are reported for inspection.
        // Like every other log call in this method, the report is skipped in
        // multi-thread mode; this also avoids the Distinct() pass when nothing is logged.
        if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
        {
            foreach (var candidate in e.LeadingColonKeyWordCandidate)
            {
                Program.Logger.WriteLine("发现多个甲方:" + candidate);
            }
        }
        if (e.LeadingColonKeyWordCandidate.Count > 0)
        {
            return(e.LeadingColonKeyWordCandidate[0]);
        }


        // Fallback pass 1: tender wording ("... received from X ...").
        var Extractor     = new ExtractPropertyByHTML();
        var CandidateWord = new List <String>();
        var StartArray    = new string[] { "招标单位", "业主", "收到", "接到" };
        var EndArray      = new string[] { "发来", "发出", "的中标" };

        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // business rule: tender agents are never party A
            }
            // Strip role words that the start phrases may have left inside the name.
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.JiaFangES.MaxLength)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // measured on the untrimmed name so all-English names are not rejected here
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        // Fallback pass 2: contract wording ("signed with X").
        Extractor  = new ExtractPropertyByHTML();
        StartArray = new string[] { "与", "与业主" };
        EndArray   = new string[] { "签署", "签订" };
        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // business rule: tender agents are never party A
            }
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.JiaFangES.MaxLength)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // measured on the untrimmed name so all-English names are not rejected here
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }
        return(CompanyNameLogic.MostLikeCompanyName(CandidateWord));
    }
示例#7
0
    /// <summary>
    /// Runs every extraction strategy configured on this property against <paramref name="doc"/>
    /// and appends the surviving candidates to the matching candidate list.
    /// </summary>
    /// <remarks>
    /// When <c>KeyWordMap</c> is non-empty the property is treated as a pure keyword type:
    /// only <c>WordMapResult</c> is (possibly) set and no other strategy runs.
    /// Candidate lists are appended to, not cleared, so calling this method again adds to
    /// the results of earlier calls.
    /// </remarks>
    /// <param name="doc">Announcement supplying the HTML root, text file name, quotation list and dependency-parse list.</param>
    public void Extract(AnnouceDocument doc)
    {
        // Pure keyword type: the answer is the single mapped value whose keyword occurs.
        if (KeyWordMap.Count != 0)
        {
            var candidate = ExtractByKeyWordMap(doc.root);
            if (candidate.Count == 1)
            {
                WordMapResult = candidate.First();
            }
            if (candidate.Count > 1)
            {
                // Ambiguous: more than one mapped value matched, so no result is stored.
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine("找到纯关键字类型两个关键字");
                }
            }
            return;
        }

        if (LeadingColonKeyWordList != null)
        {
            // Fixed leading labels such as [项目名:]. These candidates are not subject to
            // the general checks (min/max length, ...); they only go through their own
            // dedicated preprocessor.
            var ExtractorText = new ExtractPropertyByText();
            ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
            ExtractorText.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorText.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }

            // The TEXT version may contain stray spaces, so the HTML version is checked too.
            var Extractor = new ExtractPropertyByHTML();
            Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
            Extractor.Extract(doc.root);
            // FIX: iterate the HTML extractor's candidates. The previous code looped over
            // ExtractorText.CandidateWord again, so HTML-only candidates were never seen.
            foreach (var item in Extractor.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                // Values already found in the TEXT pass are not added a second time.
                if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
                {
                    LeadingColonKeyWordCandidate.Add(PropertyValue);
                }
            }
        }

        // Quoted text (book-title marks and quotation marks) ending with a configured suffix.
        if (QuotationTrailingWordList != null)
        {
            foreach (var bracket in doc.quotationList)
            {
                foreach (var word in QuotationTrailingWordList)
                {
                    if (bracket.Value.EndsWith(word))
                    {
                        var PropertyValue = CheckCandidate(bracket.Value);
                        if (String.IsNullOrEmpty(PropertyValue))
                        {
                            continue;
                        }
                        if (!Program.IsMultiThreadMode)
                        {
                            Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                        }
                        // Not de-duplicated: a quotation matching several suffixes is added once per suffix.
                        QuotationTrailingCandidate.Add(PropertyValue);
                    }
                }
            }
        }

        // Dependency-parse based extraction.
        if (DpKeyWordList != null)
        {
            var ExtractDP = new ExtractPropertyByDP();
            ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
            foreach (var item in ExtractDP.CandidateWord)
            {
                var PropertyValue = CheckCandidate(item.Value);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                DpKeyWordCandidate.Add(PropertyValue);
            }
        }

        // Caller-supplied start/end phrase pairs, searched in the TEXT version first.
        if (ExternalStartEndStringFeature != null)
        {
            var ExtractorTEXT = new ExtractPropertyByText();
            ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorTEXT.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }

            // Covers the cases where nothing could be extracted from the TEXT version.
            var ExtractorHTML = new ExtractPropertyByHTML();
            ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorHTML.Extract(doc.root);
            foreach (var item in ExtractorHTML.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                // Skip values the TEXT pass already contributed.
                if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
                {
                    ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
                }
            }
        }
    }
示例#8
0
    /// <summary>
    /// Resolves the project (engineering) name of this announcement.
    /// </summary>
    /// <remarks>
    /// Stages are tried in order and the first accepted candidate is returned:
    /// 1. labelled keywords in the text file;
    /// 2. the same labelled keywords in the HTML;
    /// 3. quoted text ending with "工程" or "标段";
    /// 4. text inside “ ” ending with "标段" / "标";
    /// 5. dependency-parse based extraction.
    /// </remarks>
    /// <returns>The project name, or an empty string when no stage produced one.</returns>
    string GetProjectName()
    {
        var ExtractorText = new ExtractPropertyByText();

        // Stage 1 (highest priority): text following one of these labels.
        ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
        ExtractorText.ExtractFromTextFile(TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            // Evaluate the trim once and reuse the result.
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        // Stage 2: the same labels, searched in the HTML version.
        var Extractor = new ExtractPropertyByHTML();

        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        // FIX: the extractor was configured but never run, so its candidate list was read
        // before anything could have been extracted and this stage never produced a result.
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        // Stage 3: the first quoted text that ends like a project or a bid section.
        foreach (var bracket in quotationList)
        {
            if (bracket.Value.EndsWith("工程") ||
                bracket.Value.EndsWith("标段"))
            {
                return(bracket.Value);
            }
        }

        // Stage 4: text inside “ ” whose content ends with "标段" or "标".
        var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeature.MarkStartWith = "“";
        MarkFeature.MarkEndWith   = "”";
        MarkFeature.InnerEndWith  = "标段";

        var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeatureConfirm.MarkStartWith = "“";
        MarkFeatureConfirm.MarkEndWith   = "”";
        MarkFeatureConfirm.InnerEndWith  = "标";

        // A fresh extractor keeps the stage-2 keyword configuration (and whatever candidates
        // it collected) out of this stage, now that stage 2 actually runs.
        var MarkExtractor = new ExtractPropertyByHTML();
        MarkExtractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
        MarkExtractor.Extract(root);
        foreach (var item in MarkExtractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                // FIX: log the candidate text; concatenating the wrapper object itself
                // printed its ToString() rather than the extracted value.
                Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        // Stage 5: dependency-parse based extraction.
        var ExtractDP = new ExtractPropertyByDP();
        var KeyList   = new List <ExtractPropertyByDP.DPKeyWord>();

        KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
        {
            StartWord    = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
            StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
            EndWord      = new string[] { "采购", "项目", "工程", "标段" },
            EndDPValue   = new string[] { }
        });
        ExtractDP.StartWithKey(KeyList, Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            // NOTE(review): only this stage checks against MaxProjectNameLength; the earlier
            // stages use MaxContractNameLength for the same property - confirm which is intended.
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
            {
                continue;
            }
            if (ProjectName.Length <= 4)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        return(String.Empty);
    }
示例#9
0
    /// <summary>
    /// Extracts (transaction target, company) pairs from the explanation (definitions) table.
    /// </summary>
    /// <remarks>
    /// For each key in <paramref name="ExplainKeys"/> (in order) every entry of <c>ExplainDict</c>
    /// is examined; the pairs found in the first matching entry that yields any are returned.
    /// </remarks>
    /// <param name="CompanyAtExplainTable">Companies whose short/full names are used to recognise the company part of an entry.</param>
    /// <param name="ExplainKeys">Explanation-table keys to look up, tried in the given order.</param>
    /// <returns>The collected pairs; empty when nothing matched.</returns>
    private List <(string Target, string Company)> ExtractTargetFromExplainTable(List <struCompanyName> CompanyAtExplainTable, string[] ExplainKeys)
    {
        // Every non-empty short name and full name, used as the regex feature's leading words.
        var AllCompanyName = new List <String>();

        foreach (var item in CompanyAtExplainTable)
        {
            if (!String.IsNullOrEmpty(item.secShortName))
            {
                AllCompanyName.Add(item.secShortName);
            }
            if (!String.IsNullOrEmpty(item.secFullName))
            {
                AllCompanyName.Add(item.secFullName);
            }
        }

        // Equity extraction: <company name> <percentage> <equity word>.
        var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
        {
            LeadingWordList  = AllCompanyName,
            RegularExpress   = RegularTool.PercentExpress,
            TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益", "的股份", "股份" }.ToList()
        };


        // Non-equity targets. The first entry contained in the text wins, so longer,
        // more specific phrases are listed before the shorter ones they contain.
        var OtherTargets = new string[] { "资产及负债", "资产和负债",
                                          "主要资产和部分负债", "主要资产及部分负债",
                                          "经营性资产及负债", "经营性资产和负债", "应收账款和其他应收款",
                                          "负债", "债权", "全部权益", "经营性资产", "非股权类资产", "资产、负债、业务",
                                          "直属资产", "普通股股份", "土地使用权", "使用权", "房产" };

        // NOTE(review): the element is spelled "Comany" here but "Company" in the return type.
        var TargetAndCompanyList = new List <(string Target, string Comany)>();

        foreach (var Rplkey in ExplainKeys)
        {
            // The most likely key comes first in ExplainKeys.
            foreach (var ExplainDictItem in ExplainDict)
            {
                // Split the entry's key into aliases; fall back to '/' when the default separators find only one part.
                // NOTE(review): the fallback array lists the same '/' character twice - possibly one
                // was meant to be a full-width slash; confirm against the original file encoding.
                var keys  = ExplainDictItem.Key.Split(Utility.SplitChar);
                var keys2 = ExplainDictItem.Key.Split(new char[] { '/', '/' });
                if (keys.Length == 1 && keys2.Length > 1)
                {
                    keys = keys2;
                }
                // Same approach for the value, with ";" as the fallback separator.
                var values  = ExplainDictItem.Value.Split(Utility.SplitChar);
                var values2 = ExplainDictItem.Value.Split(";");
                if (values.Length == 1 && values2.Length > 1)
                {
                    values = values2;
                }

                // Aliases may start with the character 拟, which is removed before comparing.
                var SearchKey = keys.Select((x) => { return(x.StartsWith("拟") ? x.Substring(1) : x); });
                SearchKey = SearchKey.Select(x => x.Trim()).ToArray();
                if (SearchKey.Contains(Rplkey))
                {
                    // For these three keys, a value that is exactly a known company name is
                    // recorded as a 100% equity stake and returned immediately.
                    if (Rplkey.Equals("交易标的") || Rplkey.Equals("标的资产") || Rplkey.Equals("标的公司"))
                    {
                        foreach (var cn in companynamelist)
                        {
                            if (ExplainDictItem.Value.Equals(cn.secFullName) ||
                                ExplainDictItem.Value.Equals(cn.secShortName))
                            {
                                var extra = ("100%股权", ExplainDictItem.Value);
                                TargetAndCompanyList.Add(extra);
                                Console.WriteLine(Id + ":100%股权" + ExplainDictItem.Value);
                                return(TargetAndCompanyList);
                            }
                        }
                    }
                    foreach (var targetRecordItem in values)
                    {
                        // Presumably splits a phrase at connective words into single items - see Utility.CutByPOSConection.
                        var SingleItemList = Utility.CutByPOSConection(targetRecordItem);
                        foreach (var SingleItem in SingleItemList)
                        {
                            // Normalise: remove spaces and the word 合计, then keep only what follows
                            // a holder marker (持有的 / 持有 / 所持) - assuming GetStringAfter returns the text after it.
                            var targetAndcompany = SingleItem.Trim().Replace(" ", "");
                            targetAndcompany = targetAndcompany.Trim().Replace("合计", "");
                            if (targetAndcompany.Contains("持有的"))
                            {
                                targetAndcompany = Utility.GetStringAfter(targetAndcompany, "持有的");
                            }
                            if (targetAndcompany.Contains("持有"))
                            {
                                targetAndcompany = Utility.GetStringAfter(targetAndcompany, "持有");
                            }
                            if (targetAndcompany.Contains("所持"))
                            {
                                targetAndcompany = Utility.GetStringAfter(targetAndcompany, "所持");
                            }

                            // Separate the company name from the transaction target.
                            var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, targetAndcompany, targetRegular, "|");
                            if (ExpResult.Count == 0)
                            {
                                // No equity pattern: treat the item as another kind of target.
                                // (GetOtherOwnerByExplainTable is evaluated twice for a hit.)
                                if (!String.IsNullOrEmpty(GetOtherOwnerByExplainTable(targetAndcompany)))
                                {
                                    var extra = (targetAndcompany, GetOtherOwnerByExplainTable(targetAndcompany));
                                    if (!TargetAndCompanyList.Contains(extra))
                                    {
                                        TargetAndCompanyList.Add(extra);
                                    }
                                }
                                else
                                {
                                    foreach (var rc in CompanyAtExplainTable)
                                    {
                                        var IsFullNameHit = false;
                                        // The asset description may itself contain a company name.
                                        if (!String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.Contains(rc.secFullName))
                                        {
                                            foreach (var ot in OtherTargets)
                                            {
                                                if (targetAndcompany.Contains(ot))
                                                {
                                                    IsFullNameHit = true;
                                                    TargetAndCompanyList.Add((ot, rc.secFullName));
                                                    break;
                                                }
                                            }
                                        }
                                        // Only try the short name when the full name produced nothing.
                                        if (!IsFullNameHit)
                                        {
                                            if (!String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.Contains(rc.secShortName))
                                            {
                                                foreach (var ot in OtherTargets)
                                                {
                                                    if (targetAndcompany.Contains(ot))
                                                    {
                                                        IsFullNameHit = true;
                                                        // NOTE(review): a short-name match still records secFullName - confirm this is intended.
                                                        TargetAndCompanyList.Add((ot, rc.secFullName));
                                                        break;
                                                    }
                                                }
                                            }
                                        }
                                        // "<company> holds <target>" form; per the original author this may no longer occur.
                                        // Applies only while nothing at all has been collected; the remainder after the name is the target.
                                        if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.StartsWith(rc.secFullName))
                                        {
                                            var extra = (targetAndcompany.Substring(rc.secFullName.Length), rc.secFullName);
                                            if (!TargetAndCompanyList.Contains(extra))
                                            {
                                                TargetAndCompanyList.Add(extra);
                                            }
                                            break;
                                        }
                                        if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.StartsWith(rc.secShortName))
                                        {
                                            var extra = (targetAndcompany.Substring(rc.secShortName.Length), rc.secShortName);
                                            if (!TargetAndCompanyList.Contains(extra))
                                            {
                                                TargetAndCompanyList.Add(extra);
                                            }
                                            break;
                                        }
                                    }
                                }
                            }
                            else
                            {
                                foreach (var r in ExpResult)
                                {
                                    // The match value is "|"-joined: part 0 is used as the company, parts 1 and 2
                                    // together as the target (presumably percentage + equity word - confirm in RegularExFinder).
                                    var arr           = r.Value.Split("|");
                                    var target        = arr[1] + arr[2];
                                    var targetCompany = arr[0];
                                    if (targetCompany.Contains("持有的"))
                                    {
                                        targetCompany = Utility.GetStringAfter(targetCompany, "持有的");
                                    }
                                    if (targetCompany.Contains("持有"))
                                    {
                                        targetCompany = Utility.GetStringAfter(targetCompany, "持有");
                                    }
                                    if (targetCompany.Contains("所持"))
                                    {
                                        targetCompany = Utility.GetStringAfter(targetCompany, "所持");
                                    }
                                    var extra = (target.Replace(" ", ""), targetCompany.Replace(" ", ""));
                                    if (!TargetAndCompanyList.Contains(extra))
                                    {
                                        TargetAndCompanyList.Add(extra);
                                    }
                                }
                            }
                        }
                    }
                    // Stop at the first matching entry that produced at least one pair.
                    if (TargetAndCompanyList.Count != 0)
                    {
                        return(TargetAndCompanyList);
                    }
                }
            }
        }
        return(TargetAndCompanyList);
    }
示例#10
0
    /// <summary>
    /// Walks the explanation dictionary and, for the first entry whose key parts contain one of
    /// <paramref name="ExplainKeys"/> and whose value yields any match, returns the distinct
    /// (target, company) pairs found in that entry's value.
    /// </summary>
    /// <param name="ExplainKeys">Keys searched for among the split parts of each dictionary key.</param>
    /// <returns>The distinct pairs of the first productive entry, or an empty list.</returns>
    private List <(string Target, string Company)> ExtractExtend(string[] ExplainKeys)
    {
        // Percentage expression followed by an equity/interest/share word.
        var percentFeature = new ExtractProperyBase.struRegularExpressFeature()
        {
            RegularExpress   = RegularTool.PercentExpress,
            TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益", "的股份", "股份" }.ToList()
        };

        // Holding phrases; whatever follows such a phrase is kept as the company text.
        // They are applied one after another, in this order.
        var holderMarkers = new string[] { "持有的", "持有", "所持" };

        // Entries are visited in dictionary order; the first one that produces pairs wins.
        foreach (var entry in ExplainDict)
        {
            var found = new List <(string Target, string Comany)>();

            // Key parts: default separators first, slash separators as a fallback.
            var keyParts = entry.Key.Split(Utility.SplitChar);
            if (keyParts.Length == 1)
            {
                var slashParts = entry.Key.Split(new char[] { '/', '/' });
                if (slashParts.Length > 1)
                {
                    keyParts = slashParts;
                }
            }

            // Value parts: default separators first, semicolon as a fallback.
            var valueParts = entry.Value.Split(Utility.SplitChar);
            if (valueParts.Length == 1)
            {
                var semicolonParts = entry.Value.Split(";");
                if (semicolonParts.Length > 1)
                {
                    valueParts = semicolonParts;
                }
            }

            foreach (var explainKey in ExplainKeys)
            {
                if (!keyParts.Contains(explainKey))
                {
                    continue;
                }

                foreach (var rawValue in valueParts)
                {
                    var compactValue = rawValue.Replace(" ", string.Empty);
                    foreach (var fragment in compactValue.Split(Utility.SplitChar))
                    {
                        foreach (var piece in Utility.CutByPOSConection(fragment))
                        {
                            foreach (var hit in ExtractPropertyByHTML.RegularExFinder(0, piece, percentFeature, "|"))
                            {
                                // Fields 1 and 2 of the "|"-joined hit form the target text
                                // (presumably the percentage and its trailing word).
                                var fields  = hit.Value.Split("|");
                                var target  = fields[1] + fields[2];
                                // Everything in front of the hit is the company candidate.
                                var company = piece.Substring(0, hit.StartIdx);
                                foreach (var marker in holderMarkers)
                                {
                                    if (company.Contains(marker))
                                    {
                                        company = Utility.GetStringAfter(company, marker);
                                    }
                                }
                                found.Add((target, company));
                            }
                        }
                    }
                }

                if (found.Count != 0)
                {
                    return(found.Distinct().ToList());
                }
            }
        }

        return(new List <(string Target, string Comany)>());
    }
示例#11
0
    /// <summary>
    /// Extracts (target, company) pairs from the replacement (definitions) table.
    /// </summary>
    /// <returns>
    /// The pairs collected up to the first matching dictionary entry that leaves the list
    /// non-empty; an empty list when nothing matches.
    /// </returns>
    List <(string Target, string Comany)> getTargetListFromReplaceTable()
    {
        // Short names of the companies flagged with positionId == -1
        // (the original comment labels these as definitions-table entries).
        var ReplaceCompany = new List <String>();

        foreach (var c in companynamelist)
        {
            if (c.positionId == -1 && !String.IsNullOrEmpty(c.secShortName))
            {
                ReplaceCompany.Add(c.secShortName);
            }
        }

        var TargetAndCompanyList = new List <(string Target, string Comany)>();
        // Equity extraction: company short name + percentage + equity/interest word.
        var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
        {
            LeadingWordList  = ReplaceCompany,
            RegularExpress   = RegularTool.PercentExpress,
            TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益" }.ToList()
        };
        // Candidate keys; the trailing figures are the original author's statistics
        // (presumably share and occurrence count).
        var ReplacementKeys = new string[]
        {
            "交易标的",     //09%	00303
            "标的资产",     //15%	00464
            "本次交易",     //12%	00369
            "本次重组",     //09%	00297
            "拟购买资产",    //07%	00221
            "本次重大资产重组", //07%	 00219
            "置入资产",     //03%	00107
            "本次发行",     //02%	00070
            "拟注入资产",    //02%	00068
            "目标资产"      //02%	00067
        };

        foreach (var Rplkey in ReplacementKeys)
        {
            // The most likely candidates come first.
            foreach (var item in ReplacementDict)
            {
                // Key parts: default separators first, "/" as a fallback.
                var keys  = item.Key.Split(Utility.SplitChar);
                var keys2 = item.Key.Split("/");
                if (keys.Length == 1 && keys2.Length > 1)
                {
                    keys = keys2;
                }
                // Value parts: default separators first, ";" as a fallback.
                var values  = item.Value.Split(Utility.SplitChar);
                var values2 = item.Value.Split(";");
                if (values.Length == 1 && values2.Length > 1)
                {
                    values = values2;
                }
                if (!keys.Contains(Rplkey))
                {
                    continue;
                }

                foreach (var value in values)
                {
                    var targetAndcompany = value.Trim();
                    // Separate the company name from the transaction target.
                    var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, value, targetRegular, "|");
                    if (ExpResult.Count == 0)
                    {
                        // Other kinds of target: "<company short name><target text>".
                        foreach (var rc in ReplaceCompany)
                        {
                            if (targetAndcompany.StartsWith(rc))
                            {
                                // Cut from the same (trimmed) string that StartsWith tested;
                                // cutting the untrimmed value shifts the offset whenever the
                                // value has leading whitespace.
                                var extra = (targetAndcompany.Substring(rc.Length), rc);
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                                break;
                            }
                        }
                    }
                    else
                    {
                        foreach (var r in ExpResult)
                        {
                            // "|"-joined hit: field 0 is used as the company,
                            // fields 1 + 2 as the target.
                            var arr   = r.Value.Split("|");
                            var extra = (arr[1] + arr[2], arr[0]);
                            if (!TargetAndCompanyList.Contains(extra))
                            {
                                TargetAndCompanyList.Add(extra);
                            }
                        }
                    }
                }
                if (TargetAndCompanyList.Count != 0)
                {
                    return(TargetAndCompanyList);
                }
            }
        }
        return(TargetAndCompanyList);
    }
示例#12
0
    /// <summary>
    /// Extracts (target, company) pairs from the explanation (definitions) table.
    /// </summary>
    /// <param name="CompanyAtExplainTable">Companies whose short/full names are searched for in the table values.</param>
    /// <param name="ExplainKeys">Table keys to look up, tried in the given order.</param>
    /// <returns>
    /// The pairs collected up to the first matching dictionary entry that leaves the list
    /// non-empty; an empty list when nothing matches.
    /// </returns>
    private List <(string Target, string Comany)> ExtractFromExplainTable(List <struCompanyName> CompanyAtExplainTable, string[] ExplainKeys)
    {
        // Every non-empty short and full name, used as leading words of the equity pattern.
        var AllCompanyName = new List <String>();

        foreach (var item in CompanyAtExplainTable)
        {
            if (!String.IsNullOrEmpty(item.secShortName))
            {
                AllCompanyName.Add(item.secShortName);
            }
            if (!String.IsNullOrEmpty(item.secFullName))
            {
                AllCompanyName.Add(item.secFullName);
            }
        }

        // Equity extraction: company name + percentage + equity/interest word.
        var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
        {
            LeadingWordList  = AllCompanyName,
            RegularExpress   = RegularTool.PercentExpress,
            TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益" }.ToList()
        };

        // Non-equity targets recognised by plain substring match.
        var OtherTargets = new string[] { "资产及负债", "直属资产" };

        var TargetAndCompanyList = new List <(string Target, string Comany)>();

        foreach (var Rplkey in ExplainKeys)
        {
            // The most likely candidates come first.
            foreach (var item in ExplainDict)
            {
                // Key parts: default separators first, "/" as a fallback.
                var keys  = item.Key.Split(Utility.SplitChar);
                var keys2 = item.Key.Split("/");
                if (keys.Length == 1 && keys2.Length > 1)
                {
                    keys = keys2;
                }
                // Value parts: default separators first, ";" as a fallback.
                var values  = item.Value.Split(Utility.SplitChar);
                var values2 = item.Value.Split(";");
                if (values.Length == 1 && values2.Length > 1)
                {
                    values = values2;
                }

                // Keys may start with the character "拟", which has to be dropped before comparing.
                var SearchKey = keys.Select(x => (x.StartsWith("拟") ? x.Substring(1) : x).Trim()).ToArray();
                if (!SearchKey.Contains(Rplkey))
                {
                    continue;
                }

                foreach (var targetRecordItem in values)
                {
                    foreach (var SingleItem in Utility.CutByPOSConection(targetRecordItem))
                    {
                        var targetAndcompany = SingleItem.Trim().Replace(" ", "");
                        // Separate the company name from the transaction target.
                        var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, targetAndcompany, targetRegular, "|");
                        if (ExpResult.Count != 0)
                        {
                            foreach (var r in ExpResult)
                            {
                                // "|"-joined hit: field 0 is used as the company,
                                // fields 1 + 2 as the target.
                                var arr   = r.Value.Split("|");
                                var extra = (arr[1] + arr[2], arr[0]);
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                            }
                            continue;
                        }

                        // Other kinds of target.
                        foreach (var rc in CompanyAtExplainTable)
                        {
                            var IsFullNameHit = false;
                            if (!String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.Contains(rc.secFullName))
                            {
                                foreach (var ot in OtherTargets)
                                {
                                    if (targetAndcompany.Contains(ot))
                                    {
                                        IsFullNameHit = true;
                                        // Tuple order is (Target, Comany): the asset kind is the
                                        // target and the company name is the company, matching
                                        // every other Add in this method.
                                        TargetAndCompanyList.Add((ot, rc.secFullName));
                                        break;
                                    }
                                }
                            }

                            // The short name is only tried when the full name produced nothing.
                            if (!IsFullNameHit && !String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.Contains(rc.secShortName))
                            {
                                foreach (var ot in OtherTargets)
                                {
                                    if (targetAndcompany.Contains(ot))
                                    {
                                        TargetAndCompanyList.Add((ot, rc.secShortName));
                                        break;
                                    }
                                }
                            }

                            // Fallback while nothing has been found at all:
                            // "<company name><target text>". The remainder is cut from the
                            // same normalised string that StartsWith tested, so whitespace in
                            // the raw item cannot shift the offset.
                            if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.StartsWith(rc.secFullName))
                            {
                                // The list is empty here, so no duplicate check is needed.
                                TargetAndCompanyList.Add((targetAndcompany.Substring(rc.secFullName.Length), rc.secFullName));
                                break;
                            }
                            if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.StartsWith(rc.secShortName))
                            {
                                TargetAndCompanyList.Add((targetAndcompany.Substring(rc.secShortName.Length), rc.secShortName));
                                break;
                            }
                        }
                    }
                }
                if (TargetAndCompanyList.Count != 0)
                {
                    return(TargetAndCompanyList);
                }
            }
        }
        return(TargetAndCompanyList);
    }
示例#13
0
    /// <summary>
    /// Determines party A (甲方) of the document.
    /// </summary>
    /// <param name="YiFang">Party B as already extracted; may be null or empty.</param>
    /// <returns>
    /// The first leading-keyword candidate if there is one; otherwise the NER result
    /// (or a start/end-feature candidate ending with it); otherwise whatever
    /// <c>CompanyNameLogic.MostLikeCompanyName</c> picks from the candidates.
    /// </returns>
    string GetJiaFang(String YiFang)
    {
        // Highest-confidence extraction: "<keyword>:" followed by the value.
        EntityProperty e = new EntityProperty();

        e.ExcludeContainsWordList = new string[] { "招标代理" };
        e.LeadingColonKeyWordList = new string[] {
            "甲方:", "合同买方:",
            "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "项目招标人:",
            "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
            "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
        };
        e.CandidatePreprocess = (x =>
        {
            x = Normalizer.ClearTrailing(x);
            return(CompanyNameLogic.AfterProcessFullName(x).secFullName);
        });
        e.MaxLength = 32;
        e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
        e.MinLength = 3;
        e.Extract(this);
        // No Distinct on the candidate list itself: the more often a candidate occurs,
        // the more trustworthy it is. Several different candidates may mean there is
        // no party A at all, so they are logged.
        if (e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
        {
            foreach (var candidate in e.LeadingColonKeyWordCandidate)
            {
                Program.Logger.WriteLine("发现多个甲方:" + candidate);
            }
        }
        if (e.LeadingColonKeyWordCandidate.Count > 0)
        {
            return(e.LeadingColonKeyWordCandidate[0]);
        }

        var ner    = SearchJiaFang();
        var NerJia = String.Empty;

        if (!String.IsNullOrEmpty(ner))
        {
            // Replace a short name with the corresponding full name.
            foreach (var cn in companynamelist)
            {
                if (cn.secShortName == ner)
                {
                    ner = cn.secFullName;
                }
            }
            // Accept the NER result unless it is identical to party B. The null/empty
            // check must short-circuit: calling Equals on a null YiFang would throw.
            if (String.IsNullOrEmpty(YiFang) || !YiFang.Equals(ner))
            {
                NerJia = ner;
            }
        }

        // Tender announcements.
        var Extractor     = new ExtractPropertyByHTML();
        var CandidateWord = new List <String>();
        var StartArray    = new string[] { "招标单位", "业主", "收到", "接到" };
        var EndArray      = new string[] { "发来", "发出", "的中标" };

        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // special business rule
            }
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // the real length is used so all-English names are not excluded
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        // Contracts.
        Extractor  = new ExtractPropertyByHTML();
        StartArray = new string[] { "与", "与业主" };
        EndArray   = new string[] { "签署", "签订" };
        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // special business rule
            }
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // the real length is used so all-English names are not excluded
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        if (String.IsNullOrEmpty(NerJia))
        {
            return(CompanyNameLogic.MostLikeCompanyName(CandidateWord));
        }

        // In principle the NER result wins. A candidate that ends with it is preferred,
        // though, because NER may have caught only the tail of the name, e.g.
        // NER "(集团)有限公司" where the real name is "XXXX(集团)有限公司".
        foreach (var c in CandidateWord)
        {
            if (c.EndsWith(NerJia))
            {
                return(c);
            }
        }
        return(NerJia);
    }