Example #1
0
    /// <summary>
    /// Recomputes the maximum-length statistics from <c>TraningDataset.ContractList</c>.
    /// For JiaFang, YiFang, contract name and project name it records the longest value
    /// (measured after <c>EntityWordAnlayzeTool.TrimEnglish</c>) together with its length in the
    /// corresponding <c>Max*</c> / <c>Max*Length</c> members, then writes all of them to
    /// <c>Program.Training</c>.
    /// </summary>
    public static void TraningMaxLenth()
    {
        // Restart every length counter from zero before scanning the dataset.
        MaxJiaFangLength      = 0;
        MaxYiFangLength       = 0;
        MaxContractNameLength = 0;
        MaxProjectNameLength  = 0;
        foreach (var c in TraningDataset.ContractList)
        {
            var TEJiaFang = EntityWordAnlayzeTool.TrimEnglish(c.JiaFang);
            if (TEJiaFang.Length > MaxJiaFangLength)
            {
                MaxJiaFangLength = TEJiaFang.Length;
                MaxJiaFang       = TEJiaFang;
            }

            var TEYiFang = EntityWordAnlayzeTool.TrimEnglish(c.YiFang);
            if (TEYiFang.Length > MaxYiFangLength)
            {
                MaxYiFangLength = TEYiFang.Length;
                MaxYiFang       = TEYiFang;
            }

            // The contract-name field is split on "、" and every part is measured separately.
            var ContractList = c.ContractName.Split("、");
            foreach (var cn in ContractList)
            {
                var TEContractName = EntityWordAnlayzeTool.TrimEnglish(cn);
                if (TEContractName.Length > MaxContractNameLength)
                {
                    MaxContractNameLength = TEContractName.Length;
                    MaxContractName       = TEContractName;
                }
            }

            // The project-name field is split the same way.
            var ProjectNameList = c.ProjectName.Split("、");
            foreach (var jn in ProjectNameList)
            {
                // Parts that contain a comma are not measured.
                if (jn.Contains(","))
                {
                    continue;
                }
                var TEProjectName = EntityWordAnlayzeTool.TrimEnglish(jn);
                // Compare against the running project-name maximum (not the contract-name one)
                // so that MaxProjectNameLength ends up as the true maximum over all parts.
                if (TEProjectName.Length > MaxProjectNameLength)
                {
                    MaxProjectNameLength = TEProjectName.Length;
                    MaxProjectName       = TEProjectName;
                }
            }
        }
        Program.Training.WriteLine("最大甲方(除去英语)长度:" + MaxJiaFangLength);
        Program.Training.WriteLine("最大甲方(除去英语):" + MaxJiaFang);
        Program.Training.WriteLine("最大乙方(除去英语)长度:" + MaxYiFangLength);
        Program.Training.WriteLine("最大乙方(除去英语):" + MaxYiFang);
        Program.Training.WriteLine("最大合同(除去英语)长度:" + MaxContractNameLength);
        Program.Training.WriteLine("最大合同(除去英语):" + MaxContractName);
        Program.Training.WriteLine("最大工程(除去英语)长度:" + MaxProjectNameLength);
        Program.Training.WriteLine("最大工程(除去英语):" + MaxProjectName);
        // Sample values left by the original author (presumably long project names seen in the data):
        //新建北京至石家庄铁路客运专线石家庄枢纽(北京局代建部分)站场工程一个标段
        //新建大塔至四眼井铁路吴四圪堵至四眼井段站前工程wssg-1标段
    }
Example #2
0
    /// <summary>
    /// Extracts the stock-change records for this announcement.
    /// </summary>
    /// <remarks>
    /// The holder name from <c>GetHolderName()</c> is registered in <c>companynamelist</c> when both
    /// its full and short forms are present. Records produced by <c>ExtractFromTable()</c> are
    /// returned directly when there are any; otherwise one record is assembled from the holder
    /// name and the change end date and is added only when it passes the checks below.
    /// </remarks>
    /// <returns>The extracted records; may be empty.</returns>
    public List <struStockChange> Extract()
    {
        // The value returned here is not consumed in this method; the call itself is retained.
        LocateDateRange(root);
        var holder = GetHolderName();

        bool hasFullName  = !String.IsNullOrEmpty(holder.FullName);
        bool hasShortName = !String.IsNullOrEmpty(holder.ShortName);
        if (hasFullName && hasShortName)
        {
            var known = new struCompanyName();
            known.secFullName  = holder.FullName;
            known.secShortName = holder.ShortName;
            companynamelist.Add(known);
        }

        // Table-based extraction comes first (alternative kept disabled: ExtractFromTableByContent()).
        List <struStockChange> records = ExtractFromTable();
        if (records.Count > 0)
        {
            // Original author's note: returning directly here slightly improves the score
            // because of recall and similar factors.
            return records;
        }

        // Fallback: assemble a single record from the text-level fields.
        var record = new struStockChange();
        record.id             = Id;       // announcement ID
        record.HolderFullName = holder.FullName.NormalizeTextResult();
        bool fullNameTooLong = EntityWordAnlayzeTool.TrimEnglish(record.HolderFullName).Length > ContractTraning.MaxYiFangLength;
        if (fullNameTooLong)
        {
            record.HolderFullName = String.Empty;
        }
        record.HolderShortName = holder.ShortName;
        record.ChangeEndDate   = GetChangeEndDate(root);

        // An end date that cannot be parsed is cleared, except in debug mode where it is kept.
        DateTime parsedEndDate;
        bool endDateParses = DateTime.TryParse(record.ChangeEndDate, out parsedEndDate);
        if (!endDateParses && !Program.IsDebugMode)
        {
            record.ChangeEndDate = String.Empty;
        }

        // The record needs both a holder name and an end date.
        if (string.IsNullOrEmpty(record.HolderFullName) || string.IsNullOrEmpty(record.ChangeEndDate))
        {
            return records;
        }
        // A holder name containing "增持" or "减持" is rejected.
        if (record.HolderFullName.Contains("增持") || record.HolderFullName.Contains("减持"))
        {
            return records;
        }

        records.Add(record);
        return records;
    }
Example #3
0
    /// <summary>
    /// Picks a project name from the parsed HTML document.
    /// </summary>
    /// <remarks>
    /// Three sources are tried in order: candidates following the leading keywords, candidates
    /// matching the “…” mark features, and finally <c>BussinessLogic.GetProjectName</c>.
    /// In the first two passes a candidate is accepted when its English-stripped length does not
    /// exceed <c>ContractTraning.MaxContractNameLength</c>.
    /// NOTE(review): the limit used is the contract-name maximum, not
    /// <c>MaxProjectNameLength</c> — confirm whether that is intended.
    /// </remarks>
    /// <param name="root">Parsed HTML root handed to the extractor.</param>
    /// <returns>The trimmed project name, or an empty string when nothing qualifies.</returns>
    static string GetProjectName(MyRootHtmlNode root)
    {
        var extractor = new EntityProperty();

        // Pass 1: text that follows one of these keywords.
        extractor.LeadingWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
        extractor.Extract(root);
        foreach (var candidate in extractor.CandidateWord)
        {
            var trimmed = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + candidate + "]");
                return trimmed;
            }
        }

        // Pass 2: the same extractor is run again with two “…” mark features added
        // (InnerEndWith "标段" and "标").
        extractor.MarkFeature = new EntityProperty.struMarkFeature[]
        {
            new EntityProperty.struMarkFeature { MarkStartWith = "“", MarkEndWith = "”", InnerEndWith = "标段" },
            new EntityProperty.struMarkFeature { MarkStartWith = "“", MarkEndWith = "”", InnerEndWith = "标" }
        };
        extractor.Extract(root);
        foreach (var candidate in extractor.CandidateWord)
        {
            var trimmed = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
            {
                Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + candidate + "]");
                return trimmed;
            }
        }

        // Pass 3: fall back to the business-logic lookup and take its first entry.
        var fallback = BussinessLogic.GetProjectName(root);
        if (fallback.Count == 0)
        {
            return "";
        }
        return fallback[0];
    }
Example #4
0
    /// <summary>
    /// Runs the surrounding-word analysis for the project name of four fixed training contracts.
    /// For each ID the matching HTML file under <c>Program.DocBase</c> is parsed, the first
    /// contract returned by <c>TraningDataset.GetContractById</c> is taken, and both are passed to
    /// <c>EntityWordAnlayzeTool.AnlayzeEntitySurroundWords</c>.
    /// </summary>
    public static void RunWordAnlayze()
    {
        const string htmlFolder = @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\";
        var sampleIds = new string[] { "1044779", "1450", "1042224", "917362" };

        foreach (var id in sampleIds)
        {
            var document = HTMLEngine.Anlayze(Program.DocBase + htmlFolder + id + ".html");
            var sample   = TraningDataset.GetContractById(id)[0];
            EntityWordAnlayzeTool.AnlayzeEntitySurroundWords(document, sample.ProjectName);
        }
    }
Example #5
0
    /// <summary>
    /// Analyzes the intrinsic properties of the entity words in the training contracts.
    /// For each of JiaFang, YiFang, contract name and project name it writes a heading to
    /// <c>Program.Training</c>, calls <c>EntityWordAnlayzeTool.Init()</c>, feeds the field of
    /// every contract to <c>EntityWordAnlayzeTool.PutEntityWordPerperty</c>, and then calls
    /// <c>EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog()</c>.
    /// </summary>
    public static void EntityWordPerperty()
    {
        // The locals previously declared here (a PosSegmenter and two dictionaries) were never
        // read, so they have been removed.

        // JiaFang statistics.
        Program.Training.WriteLine("甲方统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.JiaFang);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // YiFang statistics.
        Program.Training.WriteLine("乙方统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.YiFang);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // Contract-name statistics.
        Program.Training.WriteLine("合同统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ContractName);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // Project-name statistics.
        Program.Training.WriteLine("工程统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ProjectName);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();
    }
Example #6
0
    /// <summary>
    /// Runs the extractors and a few helper routines against fixed training files and sample
    /// strings. Return values are not checked here; presumably this is a manual debugging
    /// harness whose locals are inspected in a debugger — TODO confirm.
    /// </summary>
    public static void ContractTest()
    {
        // Every file lives under the training-set folder below Program.DocBase.
        var trainBase         = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518";
        var stockChangeHtml   = trainBase + @"\增减持\html\";
        var increaseStockHtml = trainBase + @"\定增\html\";
        var contractHtml      = trainBase + @"\重大合同\html\";

        StockChange.Extract(stockChangeHtml + "20526193.html");
        StockChange.Extract(stockChangeHtml + "20596890.html");
        StockChange.Extract(stockChangeHtml + "1018217.html");
        StockChange.Extract(stockChangeHtml + "314146.html");
        IncreaseStock.Extract(increaseStockHtml + "7880.html");

        // Samples for item-list-number normalization.
        var x1 = Normalizer.NormalizeItemListNumber("(4)2012 年 4 月,公司与中国华西企业股份");
        var x2 = Normalizer.NormalizeItemListNumber("4 、承包方式: 从深化设计、制作、运输、");
        var x3 = Normalizer.NormalizeItemListNumber("4、承包方式: 从深化设计、制作、运输、");

        Contract.Extract(contractHtml + "1153.html");
        Contract.Extract(contractHtml + "1008828.html");
        Contract.Extract(contractHtml + "3620.html");
        Contract.Extract(contractHtml + "1518.html");
        Contract.Extract(contractHtml + "1120707.html");
        Contract.Extract(contractHtml + "1044779.html");
        Contract.Extract(contractHtml + "1450.html");
        Contract.Extract(contractHtml + "1042224.html");
        Contract.Extract(contractHtml + "917362.html");
        // This call used a hard-coded absolute path (E:\WorkSpace2018\...); it now resolves the
        // same file through Program.DocBase like every other call in this method.
        IncreaseStock.Extract(increaseStockHtml + "7880.html");

        // Test of numeric money amounts.
        var TestString = "中标价为人民币共计16928.79754万元(大写:人民币壹亿陆仟玖佰贰拾捌万柒仟玖佰柒拾伍元肆角整)。";
        var Result     = Utility.SeekMoney(TestString);

        //Console.WriteLine(Result.Item1);

        TestString = "安徽盛运环保(集团)股份有限公司";
        //Result = Utility.GetStringBefore(TestString, "有限公司");
        //Console.WriteLine(Result);

        Contract.Extract(contractHtml + "5258.html");

        var x0 = "在此之前,2003年6月30日,本公司曾与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订了《技术许可与代理协议》,并分别于2005年11月、2006年12月和2007年 10月与MICROS相继签署了第一、二、三次补充协议。";
        var t0 = EntityWordAnlayzeTool.GetMainWordSentence(x0);
        // The input sentence, followed by what is presumably the reduced sentence expected from
        // GetMainWordSentence (left by the original author):
        //在此之前,2003年6月30日,本公司曾与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订了《技术许可与代理协议》,并分别于2005年11月、2006年12月和2007年 10月与MICROS相继签署了第一、二、三次补充协议。"
        //在此之前,2003年6月30日,本公司  与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订  《技术许可与代理协议》,并   于2005年11月、2006年12月和2007年 10月与MICROS    签署  第一、二、三次补充协议。
    }
Example #7
0
    /// <summary>
    /// Runs the surrounding-word analysis for JiaFang over every file in the training
    /// contract HTML folder under <c>Program.DocBase</c>.
    /// Files with no matching training contract, or whose first matching contract has no
    /// JiaFang value, are skipped.
    /// </summary>
    public static void AnlayzeEntitySurroundWords()
    {
        var ContractPath_TRAIN = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同";

        Console.WriteLine("前导词:甲方");
        foreach (var filename in System.IO.Directory.GetFiles(ContractPath_TRAIN + @"\html\"))
        {
            // The contract ID is the file name with ".html" removed.
            var fi = new System.IO.FileInfo(filename);
            var Id = fi.Name.Replace(".html", "");

            // Look the contract up once and reuse the result for both the emptiness check
            // and the element access.
            var contracts = TraningDataset.GetContractById(Id);
            if (contracts.Count == 0)
            {
                continue;
            }
            var contract = contracts.First();
            // Skip a missing JiaFang as well as an empty one.
            if (String.IsNullOrEmpty(contract.JiaFang))
            {
                continue;
            }
            var root = HTMLEngine.Anlayze(filename);
            EntityWordAnlayzeTool.AnlayzeEntitySurroundWords(root, contract.JiaFang);
        }
    }
Example #8
0
    /// <summary>
    /// Post-processes a raw company full name: extracts a short name from a trailing
    /// "(以下简称 …" style bracket, cuts surrounding phrases, substitutes a known full name
    /// when <c>CompanyNameLogic.GetCompanyNameByShortName</c> finds one, and strips leading words.
    /// </summary>
    /// <param name="FullName">Raw company-name text.</param>
    /// <returns>
    /// A <c>struCompanyName</c> with the cleaned full name; when a short name was found it is
    /// included and <c>Score</c> is 80, otherwise <c>Score</c> is 60.
    /// </returns>
    public static struCompanyName AfterProcessFullName(string FullName)
    {
        var ShortName = String.Empty;
        // NOTE(review): as written here the last four entries are identical to the first four
        // (possibly they were meant to be a different bracket variant) — confirm.
        var CompanyNameTrailingwords = new string[] {
            "(以下简称", "(下称", "(以下称", "(简称", "(以下简称", "(下称", "(以下称", "(简称"
        };

        // Brackets are not normalized for now.
        foreach (var trailing in CompanyNameTrailingwords)
        {
            if (FullName.Contains(trailing))
            {
                // Obtain the short name from quoted text inside the brackets.
                var BracketsList = RegularTool.GetChineseBrackets(FullName);
                foreach (var bracketItem in BracketsList)
                {
                    var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
                    if (ShortNameList.Count > 0)
                    {
                        ShortName = ShortNameList.First();
                        if (!String.IsNullOrEmpty(ShortName))
                        {
                            // Drop the first and last character (presumably the surrounding
                            // quotation marks — TODO confirm). A value shorter than two
                            // characters would make Substring throw, so it is treated as
                            // "no short name" instead.
                            ShortName = ShortName.Length >= 2
                                ? ShortName.Substring(1, ShortName.Length - 2)
                                : String.Empty;
                        }
                    }
                }
                FullName = Utility.GetStringBefore(FullName, trailing);
            }
        }
        if (FullName.Contains("及其"))
        {
            FullName = Utility.GetStringBefore(FullName, "及其");
        }
        if (FullName.Contains("股东"))
        {
            FullName = Utility.GetStringAfter(FullName, "股东");
        }
        if (FullName.Contains("一致行动人"))
        {
            FullName = Utility.GetStringAfter(FullName, "一致行动人");
        }
        // Look the name up once; when a known full name exists, use it.
        var KnownName = CompanyNameLogic.GetCompanyNameByShortName(FullName);
        if (!String.IsNullOrEmpty(KnownName.secFullName))
        {
            FullName = KnownName.secFullName;
        }
        // Remove leading words.
        FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
        FullName = CutOtherLeadingWords(FullName);
        if (ShortName != String.Empty)
        {
            return(new struCompanyName()
            {
                secFullName = FullName, secShortName = ShortName, Score = 80
            });
        }
        else
        {
            return(new struCompanyName()
            {
                secFullName = FullName, Score = 60
            });
        }
    }
Example #9
0
    /// <summary>
    /// Gets the project name for the current document.
    /// </summary>
    /// <remarks>
    /// Sources are tried in this order and the first acceptable candidate is returned:
    /// keyword candidates from the text file, keyword candidates from the HTML, quoted text in
    /// <c>quotationList</c> ending with "工程" or "标段", “…” mark-feature candidates from the
    /// HTML, and finally dependency-parse candidates.
    /// NOTE(review): the first passes limit length with <c>ContractTraning.MaxContractNameLength</c>
    /// while the last uses <c>MaxProjectNameLength</c> — confirm which limit is intended.
    /// </remarks>
    /// <returns>The project name, or <c>String.Empty</c> when no candidate qualifies.</returns>
    string GetProjectName()
    {
        var ExtractorText = new ExtractPropertyByText();

        // Text following these keywords (highest priority).
        ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
        ExtractorText.ExtractFromTextFile(TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            // Apply TrimJianCheng once; an empty result disqualifies the candidate.
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        var Extractor = new ExtractPropertyByHTML();

        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        // Run the extraction before reading its candidates; without this call the loop below
        // iterates an extractor that has not processed the document yet.
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        // Quoted text that ends with "工程" or "标段" is returned as-is.
        foreach (var bracket in quotationList)
        {
            if (bracket.Value.EndsWith("工程") ||
                bracket.Value.EndsWith("标段"))
            {
                return(bracket.Value);
            }
        }

        // “…” mark features with InnerEndWith "标段" and "标".
        var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeature.MarkStartWith = "“";
        MarkFeature.MarkEndWith   = "”";
        MarkFeature.InnerEndWith  = "标段";

        var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeatureConfirm.MarkStartWith = "“";
        MarkFeatureConfirm.MarkEndWith   = "”";
        MarkFeatureConfirm.InnerEndWith  = "标";

        Extractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                // Log the candidate text, as the other passes do, rather than the candidate object.
                Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        // Dependency-parse based extraction between the start words and end words below.
        var ExtractDP = new ExtractPropertyByDP();
        var KeyList   = new List <ExtractPropertyByDP.DPKeyWord>();

        KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
        {
            StartWord    = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
            StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
            EndWord      = new string[] { "采购", "项目", "工程", "标段" },
            EndDPValue   = new string[] { }
        });
        ExtractDP.StartWithKey(KeyList, Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
            {
                continue;
            }
            // Candidates of four characters or fewer are rejected.
            if (ProjectName.Length <= 4)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
            }
            return(ProjectName);
        }

        return(String.Empty);
    }
Example #10
0
    /// <summary>
    /// Gets JiaFang (the first party of the contract) for the current document.
    /// </summary>
    /// <remarks>
    /// Keyword-led candidates collected through <c>EntityProperty</c> win outright: when any
    /// exist, the first one is returned. Otherwise candidates from the bidding patterns and the
    /// contract-signing patterns are collected and <c>CompanyNameLogic.MostLikeCompanyName</c>
    /// chooses among them.
    /// </remarks>
    /// <returns>The value chosen as described above.</returns>
    public string GetJiaFang()
    {
        // Extraction with the highest confidence.
        EntityProperty e = new EntityProperty();

        e.ExcludeContainsWordList = new string[] { "招标代理" };
        e.LeadingColonKeyWordList = new string[] {
            "甲方:", "合同买方:",
            "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
            "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
            "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
        };
        e.CandidatePreprocess = (x =>
        {
            x = Normalizer.ClearTrailing(x);
            return(CompanyNameLogic.AfterProcessFullName(x).secFullName);
        });
        e.MaxLength = ContractTraning.MaxJiaFangLength;
        e.MaxLengthCheckPreprocess = EntityWordAnlayzeTool.TrimEnglish;
        e.MinLength = 3;
        e.Extract(this);

        // Distinct is deliberately not applied to the list itself: the more often a value
        // occurs, the more credible it is.
        // Several different JiaFang values may mean that there is no JiaFang at all!
        // Logging is skipped in multi-thread mode, like the other logging in this method.
        if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
        {
            foreach (var candidate in e.LeadingColonKeyWordCandidate)
            {
                Program.Logger.WriteLine("发现多个甲方:" + candidate);
            }
        }
        if (e.LeadingColonKeyWordCandidate.Count > 0)
        {
            return(e.LeadingColonKeyWordCandidate[0]);
        }


        // Bidding patterns.
        var Extractor     = new ExtractPropertyByHTML();
        var CandidateWord = new List <String>();
        var StartArray    = new string[] { "招标单位", "业主", "收到", "接到" };
        var EndArray      = new string[] { "发来", "发出", "的中标" };

        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // special business rule
            }
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.MaxJiaFangLength)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // the actual length is used, to exclude all-English values
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        // Contract-signing patterns.
        Extractor  = new ExtractPropertyByHTML();
        StartArray = new string[] { "与", "与业主" };
        EndArray   = new string[] { "签署", "签订" };
        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;                                       // special business rule
            }
            if (EntityWordAnlayzeTool.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.MaxJiaFangLength)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;                                     // the actual length is used, to exclude all-English values
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }
        return(CompanyNameLogic.MostLikeCompanyName(CandidateWord));
    }
Example #11
0
    /// <summary>
    /// Gets the contract name for the current document.
    /// </summary>
    /// <remarks>
    /// Configures an <c>EntityProperty</c> with keyword, quotation, dependency-parse and
    /// start/end-string features plus candidate pre-processors and exclusion lists, runs it on
    /// this document, and returns the result of <c>EvaluateCI()</c>.
    /// </remarks>
    /// <returns>The value returned by <c>EntityProperty.EvaluateCI()</c>.</returns>
    string GetContractName()
    {
        var entity = new EntityProperty
        {
            PropertyName = "合同名称",
            PropertyType = EntityProperty.enmType.NER,
            MaxLength    = ContractTraning.MaxContractNameLength,
            MinLength    = 5,

            // In training mode the keyword list could instead be derived from
            // ContractTraning.ContractNameLeadingDict: entries with Value >= 40 (a 40% threshold),
            // each key suffixed with ":". That variant is disabled.
            LeadingColonKeyWordList   = new string[] { "合同名称:" },
            QuotationTrailingWordList = new string[] { "协议书", "合同书", "确认书", "合同", "协议" },
            // Original author's note: for now only true can be chosen.
            QuotationTrailingWordList_IsSkipBracket = true
        };

        // Dependency-parse key words.
        entity.DpKeyWordList = new List <ExtractPropertyByDP.DPKeyWord>
        {
            new ExtractPropertyByDP.DPKeyWord
            {
                // Original author's note: obtained through SRL training.
                StartWord    = new string[] { "签署", "签订" },
                StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
                EndWord      = new string[] { "补充协议", "合同书", "合同", "协议书", "协议", },
                EndDPValue   = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系, LTPTrainingDP.动宾关系, LTPTrainingDP.主谓关系 }
            }
        };

        // Start/end string feature (original author's note: obtained through context training).
        var startWords = new string[] { "签署了", "签订了" };
        var endWords   = new string[] { "合同" };
        entity.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(startWords, endWords);
        // Candidates from the start/end feature get "合同" appended.
        entity.ExternalStartEndStringFeatureCandidatePreprocess = found => found + "合同";
        entity.MaxLengthCheckPreprocess = text => EntityWordAnlayzeTool.TrimEnglish(text);

        // Special processor for the highest-confidence (keyword-led) candidates.
        entity.LeadingColonKeyWordCandidatePreprocess = text => Normalizer.ClearTrailing(TrimJianCheng(text));

        entity.CandidatePreprocess = raw =>
        {
            var cleaned        = Normalizer.ClearTrailing(TrimJianCheng(raw));
            var closingQuoteAt = cleaned.IndexOf("”");
            // For a value shaped like “XXX”合同 — a closing quote exists but is not the last
            // character — the opening quote is kept; otherwise it is trimmed.
            bool closingQuoteInside = closingQuoteAt != -1 && closingQuoteAt != cleaned.Length - 1;
            if (!closingQuoteInside)
            {
                cleaned = cleaned.TrimStart('“');
            }
            cleaned = cleaned.TrimStart('《');
            return cleaned.TrimEnd('》').TrimEnd('”');
        };
        entity.ExcludeContainsWordList = new string[] { "日常经营重大合同" };
        // Original author's note: the basis for the following list is weak.
        entity.ExcludeEqualsWordList = new string[] { "合同", "重大合同", "项目合同", "终止协议", "经营合同", "特别重大合同", "相关项目合同" };
        entity.Extract(this);

        // Check whether the candidates include the labelled contract name
        // (original author's note: not usable on the test set).
        var labelled = TraningDataset.ContractList.Where(x => x.id == this.Id);
        if (labelled.Any())
        {
            var expectedName = labelled.First().ContractName;
            if (!String.IsNullOrEmpty(expectedName))
            {
                entity.CheckIsCandidateContainsTarget(expectedName);
            }
        }

        // Confidence.
        entity.Confidence = ContractTraning.ContractES.GetStardardCI();
        return entity.EvaluateCI();
    }
Example #12
0
    /// <summary>
    /// Picks a contract name from the parsed HTML document.
    /// </summary>
    /// <remarks>
    /// Three extractors are tried in order: 《…》 mark features, the "合同名称:" leading keyword,
    /// and a start/end string feature. In each pass the first candidate whose English-stripped
    /// length does not exceed <c>ContractTraning.MaxContractNameLength</c> is logged and returned.
    /// </remarks>
    /// <param name="root">Parsed HTML root handed to each extractor.</param>
    /// <returns>The trimmed contract name, or an empty string when nothing qualifies.</returns>
    static string GetContractName(MyRootHtmlNode root)
    {
        // Pass 1: 《…》 marks with InnerEndWith "合同" or "确认书".
        var byMarks = new EntityProperty();
        byMarks.MarkFeature = new EntityProperty.struMarkFeature[]
        {
            new EntityProperty.struMarkFeature { MarkStartWith = "《", MarkEndWith = "》", InnerEndWith = "合同" },
            new EntityProperty.struMarkFeature { MarkStartWith = "《", MarkEndWith = "》", InnerEndWith = "确认书" }
        };
        byMarks.Extract(root);
        foreach (var candidate in byMarks.CandidateWord)
        {
            var trimmed = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
            {
                Program.Logger.WriteLine("合同名称候补词(《XXX》):[" + candidate + "]");
                return trimmed;
            }
        }

        // Pass 2: text that follows this keyword.
        var byKeyword = new EntityProperty();
        byKeyword.LeadingWordList = new string[] { "合同名称:" };
        byKeyword.Extract(root);
        foreach (var candidate in byKeyword.CandidateWord)
        {
            var trimmed = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
            {
                Program.Logger.WriteLine("合同名称候补词(关键字):[" + candidate + "]");
                return trimmed;
            }
        }

        // Pass 3: start/end string feature built from "签署了" and "合同".
        var byStartEnd = new EntityProperty();
        byStartEnd.StartEndFeature = Utility.GetStartEndStringArray(new string[] { "签署了" }, new string[] { "合同" });
        byStartEnd.Extract(root);
        foreach (var candidate in byStartEnd.CandidateWord)
        {
            var trimmed = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
            {
                Program.Logger.WriteLine("合同候补词(合同):[" + candidate + "]");
                return trimmed;
            }
        }

        return "";
    }
Example #13
0
    /// <summary>
    /// Picks JiaFang (the first party of the contract) from the parsed HTML document.
    /// </summary>
    /// <remarks>
    /// Three extractors are tried in order: leading keywords, bidding patterns, and
    /// contract-signing patterns. Each candidate goes through <c>AfterProcessJiaFang</c>; the first
    /// one that is at least 3 characters long and whose English-stripped length does not exceed
    /// <c>ContractTraning.MaxJiaFangLength</c> is logged and returned.
    /// </remarks>
    /// <param name="root">Parsed HTML root handed to each extractor.</param>
    /// <returns>The chosen JiaFang, or an empty string when nothing qualifies.</returns>
    static string GetJiaFang(MyRootHtmlNode root)
    {
        // Pass 1: text that follows one of these keywords.
        var byKeyword = new EntityProperty();
        byKeyword.LeadingWordList = new string[] {
            "甲方:",
            "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
            "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
            "采购单位:", "采购人:", "采购人名称:", "采购方:"
        };
        byKeyword.Extract(root);
        foreach (var candidate in byKeyword.CandidateWord)
        {
            var party = AfterProcessJiaFang(candidate.Trim());
            bool withinMax = EntityWordAnlayzeTool.TrimEnglish(party).Length <= ContractTraning.MaxJiaFangLength;
            // The minimum is checked on the actual length, to exclude all-English values.
            if (withinMax && party.Length >= 3)
            {
                Program.Logger.WriteLine("甲方候补词(关键字):[" + party + "]");
                return party;
            }
        }

        // Pass 2: bidding patterns.
        var byBidding = new EntityProperty();
        byBidding.StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "招标单位", "业主", "收到", "接到" },
            new string[] { "发来", "发出", "的中标" });
        byBidding.Extract(root);
        foreach (var candidate in byBidding.CandidateWord)
        {
            var party = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
            bool withinMax = EntityWordAnlayzeTool.TrimEnglish(party).Length <= ContractTraning.MaxJiaFangLength;
            if (withinMax && party.Length >= 3)
            {
                Program.Logger.WriteLine("甲方候补词(招标):[" + party + "]");
                return party;
            }
        }

        // Pass 3: contract-signing patterns.
        var bySigning = new EntityProperty();
        bySigning.StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "与", "与业主" },
            new string[] { "签署", "签订" });
        bySigning.Extract(root);
        foreach (var candidate in bySigning.CandidateWord)
        {
            var party = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
            bool withinMax = EntityWordAnlayzeTool.TrimEnglish(party).Length <= ContractTraning.MaxJiaFangLength;
            if (withinMax && party.Length >= 3)
            {
                Program.Logger.WriteLine("甲方候补词(合同):[" + party + "]");
                return party;
            }
        }

        return "";
    }
Example #14
0
    /// <summary>
    /// Recomputes the maximum-length statistics from <c>TraningDataset.ContractList</c>.
    /// For JiaFang, YiFang, contract name and project name it records the longest value
    /// (measured after <c>EntityWordAnlayzeTool.TrimEnglish</c>) together with its length in the
    /// corresponding <c>Max*</c> / <c>Max*Length</c> members; it also lowers <c>MinAmount</c> to
    /// the smallest parseable <c>ContractMoneyUpLimit</c>. All values are then written to
    /// <c>Program.Training</c>.
    /// </summary>
    public static void TraningMaxLenth()
    {
        // Restart every length counter from zero before scanning the dataset.
        // NOTE(review): MinAmount is not reset here; its initial value is set elsewhere — confirm.
        MaxJiaFangLength      = 0;
        MaxYiFangLength       = 0;
        MaxContractNameLength = 0;
        MaxProjectNameLength  = 0;
        foreach (var c in TraningDataset.ContractList)
        {
            var TEJiaFang = EntityWordAnlayzeTool.TrimEnglish(c.JiaFang);
            if (TEJiaFang.Length > MaxJiaFangLength)
            {
                MaxJiaFangLength = TEJiaFang.Length;
                MaxJiaFang       = TEJiaFang;
            }

            var TEYiFang = EntityWordAnlayzeTool.TrimEnglish(c.YiFang);
            if (TEYiFang.Length > MaxYiFangLength)
            {
                MaxYiFangLength = TEYiFang.Length;
                MaxYiFang       = TEYiFang;
            }

            // The contract-name field is split on "、" and every part is measured separately.
            var ContractList = c.ContractName.Split("、");
            foreach (var cn in ContractList)
            {
                var TEContractName = EntityWordAnlayzeTool.TrimEnglish(cn);
                if (TEContractName.Length > MaxContractNameLength)
                {
                    MaxContractNameLength = TEContractName.Length;
                    MaxContractName       = TEContractName;
                }
            }

            // Track the smallest upper-limit amount among the values that parse as a double.
            if (!string.IsNullOrEmpty(c.ContractMoneyUpLimit))
            {
                var m = 0.0;
                if (double.TryParse(c.ContractMoneyUpLimit, out m))
                {
                    if (m < MinAmount)
                    {
                        MinAmount = m;
                    }
                }
            }

            // The project-name field is split the same way.
            var ProjectNameList = c.ProjectName.Split("、");
            foreach (var jn in ProjectNameList)
            {
                // Parts that contain a comma are not measured.
                if (jn.Contains(","))
                {
                    continue;
                }
                var TEProjectName = EntityWordAnlayzeTool.TrimEnglish(jn);
                // Compare against the running project-name maximum (not the contract-name one)
                // so that MaxProjectNameLength ends up as the true maximum over all parts.
                if (TEProjectName.Length > MaxProjectNameLength)
                {
                    MaxProjectNameLength = TEProjectName.Length;
                    MaxProjectName       = TEProjectName;
                }
            }
        }
        Program.Training.WriteLine("最大甲方(除去英语)长度:" + MaxJiaFangLength);
        Program.Training.WriteLine("最大甲方(除去英语):" + MaxJiaFang);
        Program.Training.WriteLine("最大乙方(除去英语)长度:" + MaxYiFangLength);
        Program.Training.WriteLine("最大乙方(除去英语):" + MaxYiFang);
        Program.Training.WriteLine("最大合同(除去英语)长度:" + MaxContractNameLength);
        Program.Training.WriteLine("最大合同(除去英语):" + MaxContractName);
        Program.Training.WriteLine("最大工程(除去英语)长度:" + MaxProjectNameLength);
        Program.Training.WriteLine("最大工程(除去英语):" + MaxProjectName);
        Program.Training.WriteLine("最小金额:" + MinAmount);
    }