Пример #1
0
        /// <summary>
        /// Program entry point. Opens the log writers, loads the dictionaries and runs the
        /// pre-processing step. Every writer that was opened is closed before the method
        /// returns, including on the early return after <c>Traning()</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            Logger = new StreamWriter("Log.log");
            try
            {
                // Register the code-page encodings globally.
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

                // Dictionary of company full names, short names and former names.
                CompanyNameLogic.LoadCompanyName(@"Resources" + Path.DirectorySeparatorChar + "FDDC_announcements_company_name_20180531.json");
                // Publish dates of the shareholding increase/decrease announcements.
                StockChange.ImportPublishTime();
                // Place-name correction dictionary for the Jieba segmenter.
                PosNS.ImportNS(@"Resources" + Path.DirectorySeparatorChar + "ns.dict");
                CIRecord = new StreamWriter("CI.log");
                // Pre-processing. The early return is a developer switch: everything below it
                // is currently skipped, which is why the clean-up lives in the finally block.
                Traning(); return;

                Evaluator = new StreamWriter("Evaluator.log");
                Score     = new StreamWriter(@"Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
                //new Contract(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1008828.html").Extract();return;
                Extract();
            }
            finally
            {
                // Close() flushes buffered text; without it the tail of each log is lost.
                // Writers that were never opened on this path are still null, hence "?.".
                CIRecord?.Close();
                Score?.Close();
                Evaluator?.Close();
                Logger?.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// Program entry point. Opens the log writers and runs the quick-test area. The
        /// full pipeline below the early return is currently skipped. Every writer that
        /// was opened is closed before the method returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            if (Environment.OSVersion.Platform == System.PlatformID.Unix)
            {
                // Original author's note: the static variables are already fixed at this
                // point and cannot be changed here, so the path is only reported.
                Console.WriteLine("Switch Doc Path To:" + DocBase);
            }
            // Main log.
            Logger = new StreamWriter("Log.log");
            try
            {
                // The entity-property extractor writes to the same log.
                EntityProperty.Logger = Logger;
                // Register the code-page encodings globally.
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

                CIRecord = new StreamWriter("CI.log");

                // Developer switch: run only the quick-test area. The clean-up lives in the
                // finally block so the writers opened above are still flushed and closed.
                QuickTestArea(); return;

                //PDFToTXT.GetPdf2TxtBatchFile();

                // Dictionary of company full names, short names and former names.
                CompanyNameLogic.LoadCompanyName("Resources" + Path.DirectorySeparatorChar + "FDDC_announcements_company_name_20180531.json");
                // Place-name correction dictionary for the Jieba segmenter.
                PosNS.ImportNS("Resources" + Path.DirectorySeparatorChar + "ns.dict");
                // Pre-processing.
                Traning();
                Evaluator = new StreamWriter("Evaluator.log");
                Score     = new StreamWriter("Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
                Extract();
            }
            finally
            {
                // Close() flushes buffered text; without it the tail of each log is lost.
                // Writers that were never opened on this path are still null, hence "?.".
                CIRecord?.Close();
                Score?.Close();
                Evaluator?.Close();
                Logger?.Close();
            }
        }
Пример #3
0
        /// <summary>
        /// Scratch area for quick manual experiments. Only the NER company-name lookup at
        /// the top runs; the statements after the early return are kept as disabled samples.
        /// </summary>
        private static void QuickTestArea()
        {
            var nerXmlPath = StockChangePath_TEST + "/ner/18877033.xml";
            var paragraphs = LTPTrainingNER.GetParagraghList(nerXmlPath);

            CompanyNameLogic.GetCompanyNameByNerInfo(paragraphs);
            return;

            // ---- disabled below this line ----

            // Segmentation sample.
            var sampleSentence = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
            var segmenter      = new PosSegmenter();
            var tokens         = segmenter.Cut(sampleSentence);

            // Evaluation output files.
            var separator = Path.DirectorySeparatorChar;
            var timestamp = System.DateTime.Now.ToString("yyyyMMddHHmmss");
            Evaluator = new StreamWriter("Evaluator.log");
            Score     = new StreamWriter("Result" + separator + "Score" + separator + "score" + timestamp + ".txt");
            //Evaluate.EvaluateReorganizationByFile(@"E:\WorkSpace2018\FDDC2018\FDDC_SRC\Result\chongzu_train.txt");
            //Score.Close();
            //Evaluator.Close();

            //TraningDataset.InitReorganization();
            // Valuation-method vocabulary handed to the reorganization training data.
            ReOrganizationTraning.EvaluateMethodList = new List<string>
            {
                "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法", "现金流折现法", "现金流折现法", "剩余法",
                "内含价值调整法", "可比公司市净率法", "重置成本法", "收益现值法", "基础资产法", "假设清偿法",
                "成本逼近法", "单项资产加和法", "成本加和法", "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
            };

            // Single-document reorganization extraction.
            var reorganization = new Reorganization
            {
                Id           = "748379",
                HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html"
            };
            //reorganization.TextFileName = ContractPath_TEST + "/txt/128869.txt";
            //reorganization.NerXMLFileName = ContractPath_TEST + "/ner/128869.xml";
            reorganization.Init();
            var records     = reorganization.Extract();
            var firstAsText = records[0].ConvertToString();
        }
Пример #4
0
    /// <summary>
    /// Resolves a company mention into a full name / short name pair.
    /// </summary>
    /// <remarks>
    /// Sources are consulted in this order: the company names attached to
    /// <paramref name="doc"/>, then <c>CompanyNameLogic.GetCompanyNameByFullName</c>,
    /// then <c>GetShortNameByFullName</c> on the document text.
    /// </remarks>
    /// <param name="doc">Announcement whose <c>companynamelist</c> is searched first.</param>
    /// <param name="word">Company mention; spaces are removed before matching.</param>
    /// <returns>
    /// The resolved full name and short name. Both are empty when
    /// <paramref name="word"/> is null or empty.
    /// </returns>
    public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
    {
        if (String.IsNullOrEmpty(word))
        {
            return(String.Empty, String.Empty);
        }
        var fullname  = word.Replace(" ", String.Empty);
        var shortname = String.Empty;

        foreach (var companyname in doc.companynamelist)
        {
            if (companyname.secFullName == fullname)
            {
                // Two entries may share the same full name while only one of them carries
                // a short name, so keep scanning until a non-empty short name shows up.
                if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName))
                {
                    shortname = companyname.secShortName;
                    break;
                }
            }
            if (companyname.secShortName == fullname)
            {
                // The mention is a known short name: swap in the matching full name.
                fullname  = companyname.secFullName;
                shortname = companyname.secShortName;
                break;
            }
            // The mention may be a short name while the document only lists the full name:
            // treat a strictly longer full name that contains the mention as its full form.
            // The null check skips entries without a full name instead of throwing on them.
            if (companyname.secFullName != null &&
                companyname.secFullName.Contains(fullname) &&
                companyname.secFullName.Length > fullname.Length)
            {
                fullname  = companyname.secFullName;
                // NOTE(review): this stores the raw word, spaces included — confirm that is intended.
                shortname = word;
            }
        }

        if (string.IsNullOrEmpty(shortname))
        {
            // Fall back to the company-name dictionary.
            shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
        }

        if (string.IsNullOrEmpty(shortname))
        {
            // Last resort, per the original author's note: look the name up in the document
            // text and check whether it is followed by a quoted "简称" (abbreviation) clause.
            shortname = GetShortNameByFullName(fullname, doc);
            if (!string.IsNullOrEmpty(shortname))
            {
                Console.WriteLine(fullname + ":" + shortname);
            }
        }

        return(fullname, shortname);
    }
Пример #5
0
    /// <summary>
    /// Resolves a company mention into a full name / short name pair, using the company
    /// names attached to the document first and the company-name dictionary second.
    /// </summary>
    /// <param name="doc">Announcement whose <c>companynamelist</c> is searched first.</param>
    /// <param name="word">Company mention; spaces are removed before matching.</param>
    /// <returns>
    /// The resolved full name and short name. Both are empty when
    /// <paramref name="word"/> is null or empty.
    /// </returns>
    public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
    {
        if (String.IsNullOrEmpty(word))
        {
            return(String.Empty, String.Empty);
        }
        var fullname  = word.Replace(" ", String.Empty);
        var shortname = String.Empty;

        foreach (var companyname in doc.companynamelist)
        {
            if (companyname.secFullName == fullname)
            {
                // Two entries may share the same full name while only one of them carries
                // a short name, so keep scanning until a non-empty short name shows up.
                if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName))
                {
                    shortname = companyname.secShortName;
                    break;
                }
            }
            if (companyname.secShortName == fullname)
            {
                // The mention is a known short name: swap in the matching full name.
                fullname  = companyname.secFullName;
                shortname = companyname.secShortName;
                break;
            }
            // The mention may be a short name while the document only lists the full name:
            // treat a strictly longer full name that contains the mention as its full form.
            // The null check skips entries without a full name instead of throwing on them.
            if (companyname.secFullName != null &&
                companyname.secFullName.Contains(fullname) &&
                companyname.secFullName.Length > fullname.Length)
            {
                fullname  = companyname.secFullName;
                // NOTE(review): this stores the raw word, spaces included — confirm that is intended.
                shortname = word;
            }
        }

        if (shortname == String.Empty)
        {
            // Fall back to the company-name dictionary.
            shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
        }
        return(fullname, shortname);
    }
Пример #6
0
    /// <summary>
    /// Builds one <c>ContractRec</c> for the current announcement: classifies the document,
    /// then fills party B (YiFang), party A (JiaFang), project name, contract name, amount
    /// and consortium members, applying text clean-up rules to each field.
    /// </summary>
    /// <returns>The populated contract record.</returns>
    ContractRec ExtractSingle()
    {
        // Classify the document by the first item whose content contains "中标" (bid won)
        // or "合同" (contract); "中标" is checked first within an item.
        contractType = String.Empty;
        foreach (var paragrah in root.Children)
        {
            foreach (var item in paragrah.Children)
            {
                if (item.Content.Contains("中标"))
                {
                    contractType = "中标";
                    break;
                }
                if (item.Content.Contains("合同"))
                {
                    contractType = "合同";
                    break;
                }
            }
            // Stop scanning paragraphs as soon as a type was found.
            if (contractType != String.Empty)
            {
                break;
            }
        }

        // Neither keyword was found anywhere: report the document id on the console.
        if (contractType == String.Empty)
        {
            Console.WriteLine("contractType Null:" + Id);
        }

        var contract = new ContractRec();

        // Announcement id.
        contract.Id = Id;

        // Party B (YiFang).
        contract.YiFang = GetYiFang();
        // A value containing "本公司" (this company) is discarded.
        if (contract.YiFang.Contains("本公司"))
        {
            contract.YiFang = string.Empty;
        }
        contract.YiFang = CompanyNameLogic.AfterProcessFullName(contract.YiFang).secFullName;
        contract.YiFang = contract.YiFang.NormalizeTextResult();
        // Brackets are removed, as required by the original author's rules.
        contract.YiFang = RegularTool.TrimBrackets(contract.YiFang);
        // Values shorter than 3 characters are discarded.
        if (contract.YiFang.Length < 3)
        {
            contract.YiFang = string.Empty;
        }


        // Party A (JiaFang); the party B value is passed to the lookup.
        contract.JiaFang = GetJiaFang(contract.YiFang);
        // A value containing "本公司" (this company) is discarded.
        if (contract.JiaFang.Contains("本公司"))
        {
            contract.JiaFang = string.Empty;
        }
        contract.JiaFang = CompanyNameLogic.AfterProcessFullName(contract.JiaFang).secFullName;
        contract.JiaFang = contract.JiaFang.NormalizeTextResult();
        // If a "简称" (abbreviation) marker is still present, keep only the text before "(".
        if (contract.JiaFang.Contains("简称"))
        {
            contract.JiaFang = Utility.GetStringBefore(contract.JiaFang, "(");
        }
        // Organisation entities (NER type Ni): when party A is not among them but
        // "国家电网公司" is, party A is replaced by "国家电网公司".
        if (Nerlist != null)
        {
            var NiList = Nerlist.Where((n) => n.Type == LTPTrainingNER.enmNerType.Ni).Select((m) => m.RawData);
            if (!NiList.Contains(contract.JiaFang))
            {
                if (NiList.Contains("国家电网公司"))
                {
                    contract.JiaFang = "国家电网公司";
                }
            }
        }
        // Project name.
        contract.ProjectName = GetProjectName();
        contract.ProjectName = contract.ProjectName.NormalizeTextResult();
        // Strip enclosing curly quotes when the whole value is quoted.
        if (contract.ProjectName.StartsWith("“") && contract.ProjectName.EndsWith("”"))
        {
            contract.ProjectName = contract.ProjectName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
        }
        // NOTE(review): the next two rules keep what Utility.GetStringAfter returns for the
        // marker; the helper is not visible here, so confirm this is the intended side.
        if (contract.ProjectName.EndsWith(",签约双方"))
        {
            contract.ProjectName = Utility.GetStringAfter(contract.ProjectName, ",签约双方");
        }
        if (contract.ProjectName.Contains("(以下简称"))
        {
            contract.ProjectName = Utility.GetStringAfter(contract.ProjectName, "(以下简称");
        }
        // For a value ending in ")", cut a bid-number or contract-number clause.
        if (contract.ProjectName.EndsWith(")"))
        {
            if (contract.ProjectName.Contains("(招标编号"))
            {
                contract.ProjectName = Utility.GetStringBefore(contract.ProjectName, "(招标编号");
            }
            if (contract.ProjectName.Contains("(合同编号"))
            {
                contract.ProjectName = Utility.GetStringBefore(contract.ProjectName, "(合同编号");
            }
        }
        contract.ProjectName = contract.ProjectName.Replace("的推荐中标", "");
        // Special-case replacements: "<1>" becomes "1、" and remaining curly quotes are removed.
        contract.ProjectName = contract.ProjectName.Replace("<1>", "1、");
        contract.ProjectName = contract.ProjectName.Replace("“", "");
        contract.ProjectName = contract.ProjectName.Replace("”", "");

        // Contract name.
        contract.ContractName = GetContractName();
        // Strip enclosing curly quotes when the whole value is quoted.
        if (contract.ContractName.StartsWith("“") && contract.ContractName.EndsWith("”"))
        {
            contract.ContractName = contract.ContractName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
        }
        // Remove the book-title marks 《 and 》.
        contract.ContractName = contract.ContractName.Replace("《", String.Empty).Replace("》", String.Empty);
        contract.ContractName = contract.ContractName.NormalizeTextResult();
        if (contract.ContractName.Contains("(以下简称"))
        {
            contract.ContractName = Utility.GetStringAfter(contract.ContractName, "(以下简称");
        }
        contract.ContractName = ExtendContractName(contract.ContractName);

        // For a purchase ("采购") agreement the project name is cleared, unless the project
        // name mentions a bid section ("标段").
        if (contract.ContractName.Contains("采购"))
        {
            if (contract.ProjectName.Contains("标段"))
            {
                // TODO: nothing is done for this case yet; the project name is kept as is.
            }
            else
            {
                contract.ProjectName = string.Empty;
            }
        }

        // Amount: the same formatted value is used for the upper and the lower limit.
        var money = GetMoney();

        contract.ContractMoneyUpLimit   = MoneyUtility.Format(money.MoneyAmount, String.Empty);
        contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

        // Consortium members.
        contract.UnionMember = GetUnionMember(contract);
        contract.UnionMember = contract.UnionMember.NormalizeTextResult();
        // Brackets are removed, as required by the original author's rules.
        contract.UnionMember = RegularTool.TrimBrackets(contract.UnionMember);

        var YiFangArray = contract.YiFang.Split(Utility.SplitChar);

        // Party B holds several names separated by Utility.SplitChar: the first one stays
        // party B, and the consortium members are overwritten with what
        // Utility.GetStringAfter returns for the separator.
        if (YiFangArray.Length > 1)
        {
            contract.UnionMember = Utility.GetStringAfter(contract.YiFang, Utility.SplitChar);
            contract.YiFang      = YiFangArray[0];
            Console.WriteLine("联合体:" + contract.UnionMember);
        }
        return(contract);
    }
Пример #7
0
    /// <summary>
    /// Extracts party A (甲方) of the contract.
    /// </summary>
    /// <returns>
    /// The first value found behind a leading "keyword:" label when there is one;
    /// otherwise the result of <c>CompanyNameLogic.MostLikeCompanyName</c> over the
    /// candidates collected from the bidding and signing phrase patterns.
    /// </returns>
    public string GetJiaFang()
    {
        // Pass 1 (highest confidence): values that follow a "keyword:" label.
        var colonExtractor = new EntityProperty
        {
            ExcludeContainsWordList = new string[] { "招标代理" },
            LeadingColonKeyWordList = new string[] {
                "甲方:", "合同买方:",
                "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
                "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
                "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
                "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
            },
            CandidatePreprocess      = raw => CompanyNameLogic.AfterProcessFullName(Normalizer.ClearTrailing(raw)).secFullName,
            MaxLength                = ContractTraning.JiaFangES.MaxLength,
            MaxLengthCheckPreprocess = Utility.TrimEnglish,
            MinLength                = 3
        };
        colonExtractor.Extract(this);

        // The candidate list is deliberately not de-duplicated: a value that shows up more
        // often is more credible. Several distinct values may mean there is no party A at
        // all, so each candidate is written to the log in that case.
        var labelled = colonExtractor.LeadingColonKeyWordCandidate;
        if (labelled.Distinct().Count() > 1)
        {
            foreach (var entry in labelled)
            {
                Program.Logger.WriteLine("发现多个甲方:" + entry);
            }
        }
        if (labelled.Count > 0)
        {
            return(labelled[0]);
        }

        var collected = new List <String>();

        // Shared tail of both passes below: apply the length limits, log the name when
        // not running multi-threaded, then keep it.
        void Keep(string name, string logPrefix)
        {
            // The upper bound ignores English text; the lower bound uses the real length,
            // which rules out names that are entirely English.
            if (Utility.TrimEnglish(name).Length > ContractTraning.JiaFangES.MaxLength || name.Length < 3)
            {
                return;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(logPrefix + name + "]");
            }
            collected.Add(name);
        }

        // Pass 2: bidding phrases.
        var biddingExtractor = new ExtractPropertyByHTML();
        biddingExtractor.StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "招标单位", "业主", "收到", "接到" },
            new string[] { "发来", "发出", "的中标" });
        biddingExtractor.Extract(root);
        foreach (var hit in biddingExtractor.CandidateWord)
        {
            var name = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim()).secFullName;
            if (name.Contains("招标代理"))
            {
                continue;   // business rule: skip names containing "招标代理"
            }
            name = name.Replace("业主", String.Empty).Trim();
            name = name.Replace("招标单位", String.Empty).Trim();
            Keep(name, "甲方候补词(招标):[");
        }

        // Pass 3: signing phrases.
        var signingExtractor = new ExtractPropertyByHTML();
        signingExtractor.StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "与", "与业主" },
            new string[] { "签署", "签订" });
        signingExtractor.Extract(root);
        foreach (var hit in signingExtractor.CandidateWord)
        {
            var name = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim()).secFullName;
            name = name.Replace("业主", String.Empty).Trim();
            if (name.Contains("招标代理"))
            {
                continue;   // business rule: skip names containing "招标代理"
            }
            Keep(name, "甲方候补词(合同):[");
        }
        return(CompanyNameLogic.MostLikeCompanyName(collected));
    }
Пример #8
0
    /// <summary>
    /// Cleans a raw company-name string: cuts "hereinafter referred to as" clauses (picking
    /// up the quoted short name from them), cuts around a few marker words, maps a known
    /// short name to its full name and removes leading words.
    /// </summary>
    /// <param name="FullName">Raw company-name text.</param>
    /// <returns>
    /// A <c>struCompanyName</c> with the cleaned full name. When a short name was picked up
    /// from an abbreviation clause it is set as well and the score is 80; otherwise the
    /// short name is left unset and the score is 60.
    /// </returns>
    public static struCompanyName AfterProcessFullName(string FullName)
    {
        var ShortName = String.Empty;
        // NOTE(review): as written, entries 5-8 repeat entries 1-4; a second bracket variant
        // (e.g. full-width) may have been intended — confirm against the repository copy.
        var CompanyNameTrailingwords = new string[] {
            "(以下简称", "(下称", "(以下称", "(简称", "(以下简称", "(下称", "(以下称", "(简称"
        };

        // Brackets are not normalised for now.
        foreach (var trailing in CompanyNameTrailingwords)
        {
            if (FullName.Contains(trailing))
            {
                // Pick up the short name from the quoted text inside the brackets.
                var BracketsList = RegularTool.GetChineseBrackets(FullName);
                foreach (var bracketItem in BracketsList)
                {
                    var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
                    if (ShortNameList.Count > 0)
                    {
                        ShortName = ShortNameList.First();
                        if (!String.IsNullOrEmpty(ShortName))
                        {
                            // Drop the first and last character (the quotation marks).
                            // A match shorter than two characters has nothing between
                            // the marks; Substring would throw on it, so treat it as
                            // "no short name".
                            ShortName = ShortName.Length >= 2
                                        ? ShortName.Substring(1, ShortName.Length - 2)
                                        : String.Empty;
                        }
                    }
                }
                FullName = Utility.GetStringBefore(FullName, trailing);
            }
        }
        if (FullName.Contains("及其"))
        {
            FullName = Utility.GetStringBefore(FullName, "及其");
        }
        if (FullName.Contains("股东"))
        {
            FullName = Utility.GetStringAfter(FullName, "股东");
        }
        if (FullName.Contains("一致行动人"))
        {
            FullName = Utility.GetStringAfter(FullName, "一致行动人");
        }
        // If the text is a known short name, replace it by the full name. The lookup is
        // done once and its result reused.
        var fullNameByShortName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
        if (!String.IsNullOrEmpty(fullNameByShortName))
        {
            FullName = fullNameByShortName;
        }
        // Remove leading words.
        FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
        FullName = CutOtherLeadingWords(FullName);
        if (ShortName != String.Empty)
        {
            return(new struCompanyName()
            {
                secFullName = FullName, secShortName = ShortName, Score = 80
            });
        }
        else
        {
            return(new struCompanyName()
            {
                secFullName = FullName, Score = 60
            });
        }
    }
Пример #9
0
    /// <summary>
    /// Builds one <c>struContract</c> for an announcement: classifies the document, then
    /// fills party A, party B, project name, contract name, amount and consortium members.
    /// </summary>
    /// <param name="root">Root node of the parsed announcement.</param>
    /// <param name="Id">Announcement id stored on the record.</param>
    /// <returns>The populated contract record.</returns>
    struContract ExtractSingle(MyRootHtmlNode root, String Id)
    {
        // Document type: decided by the first item mentioning "中标" or "合同"
        // ("中标" wins when one item mentions both).
        contractType = String.Empty;
        foreach (var paragraph in root.Children)
        {
            foreach (var node in paragraph.Children)
            {
                if (node.Content.Contains("中标"))
                {
                    contractType = "中标";
                }
                else if (node.Content.Contains("合同"))
                {
                    contractType = "合同";
                }
                else
                {
                    continue;
                }
                break;
            }
            if (contractType != String.Empty)
            {
                break;
            }
        }
        if (contractType == String.Empty)
        {
            Console.WriteLine("contractType Null:" + Id);
        }

        // Removes enclosing curly quotes when the whole text is quoted.
        string Unquote(string text)
        {
            var quoted = text.StartsWith("“") && text.EndsWith("”");
            return(quoted ? text.TrimStart('“').TrimEnd('”') : text);
        }

        // Applies Utility.GetStringAfter for the abbreviation marker when it is present.
        string CutAbbreviation(string text)
        {
            const string marker = "(以下简称";
            return(text.Contains(marker) ? Utility.GetStringAfter(text, marker) : text);
        }

        var contract = new struContract();

        // Announcement id.
        contract.id = Id;

        // Party A. State Grid ("国家电网公司") is usually party A, so it takes over when
        // the extracted name is not a recognised entity but State Grid is.
        var partyA = CompanyNameLogic.AfterProcessFullName(GetJiaFang()).secFullName.NormalizeTextResult();
        if (!Nerlist.Contains(partyA) && Nerlist.Contains("国家电网公司"))
        {
            partyA = "国家电网公司";
        }
        contract.JiaFang = partyA;

        // Party B, with brackets removed as the rules require.
        var partyB = CompanyNameLogic.AfterProcessFullName(GetYiFang()).secFullName.NormalizeTextResult();
        contract.YiFang = RegularTool.TrimBrackets(partyB);

        // Project name.
        var project = Unquote(GetProjectName());
        if (project.EndsWith(",签约双方"))
        {
            project = Utility.GetStringAfter(project, ",签约双方");
        }
        project = CutAbbreviation(project);
        contract.ProjectName = project.NormalizeTextResult();

        // Contract name. Per the original data analysis, the project name is filled for
        // bid-won documents and the contract name for contract documents.
        if (contractType == "中标")
        {
            contract.ContractName = String.Empty;
        }
        else
        {
            var title = Unquote(GetContractName());
            // Remove the book-title marks.
            title = title.Replace("《", String.Empty).Replace("》", String.Empty);
            title = CutAbbreviation(title);
            contract.ContractName = title.NormalizeTextResult();
        }

        // Amount: one formatted value serves as both limits.
        var amount = GetMoney();
        contract.ContractMoneyUpLimit   = MoneyUtility.Format(amount.MoneyAmount, String.Empty);
        contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

        // Consortium members, with brackets removed as the rules require.
        var members = GetUnionMember(contract.JiaFang, contract.YiFang).NormalizeTextResult();
        contract.UnionMember = RegularTool.TrimBrackets(members);
        return(contract);
    }
Пример #10
0
    /// <summary>
    /// Extracts shareholding-change records from the document's tables by matching
    /// column titles.
    /// </summary>
    /// <remarks>
    /// The tables are first searched with four title rules (holder, date, price, number).
    /// When that finds nothing, the search is repeated without the holder column and the
    /// holder name from <c>GetHolderName()</c> is prepended to every row. Each row is then
    /// normalised and validated. Finally, for every holder, the record with the latest
    /// change date receives the after-change holding figures from <c>GetHolderAfter()</c>.
    /// </remarks>
    /// <returns>
    /// The extracted records; an empty list when no table matches or no holder name
    /// can be determined.
    /// </returns>
    List <RecordBase> ExtractFromTable()
    {
        // Holder-name column; marked as required for the first search.
        var StockHolderRule = new TableSearchTitleRule();

        StockHolderRule.Name      = "股东全称";
        StockHolderRule.Title     = new string[] { "股东名称", "名称", "增持主体", "增持人", "减持主体", "减持人", "姓名" }.ToList();
        StockHolderRule.IsTitleEq = true;
        StockHolderRule.IsRequire = true;

        // Change end-date column.
        var ChangeDateRule = new TableSearchTitleRule();

        ChangeDateRule.Name  = "变动截止日期";
        ChangeDateRule.Title = new string[] { "买卖时间", "日期", "减持期间", "增持期间", "减持股份期间", "增持股份期间",
                                              "减持时间", "增持时间", "减持股份时间", "增持股份时间", "买入时间", "卖出时间" }.ToList();
        ChangeDateRule.IsTitleEq = false;
        ChangeDateRule.Normalize = NormailizeEndChangeDate;


        // Change price column.
        var ChangePriceRule = new TableSearchTitleRule();

        ChangePriceRule.Name      = "变动价格";
        ChangePriceRule.Title     = new string[] { "买入均价", "卖出均价", "成交均价", "减持价格", "增持价格", "减持股均价", "增持股均价", "减持均", "增持均", "价格区间" }.ToList();
        ChangePriceRule.IsTitleEq = false;
        ChangePriceRule.Normalize = (x, y) =>
        {
            var prices = RegularTool.GetRegular(x, RegularTool.MoneyExpress);
            if (prices.Count == 0)
            {
                // No money expression matched: fall back to the text before "元".
                if (x.Contains("元"))
                {
                    return(Utility.GetStringBefore(x, "元"));
                }
            }
            else
            {
                // For a price range take the highest price, assuming the last number
                // in the cell is the largest.
                return(prices.Last().RawData);
            }
            return(x);
        };

        // Change number column.
        var ChangeNumberRule = new TableSearchTitleRule();

        ChangeNumberRule.Name      = "变动数量";
        ChangeNumberRule.Title     = new string[] { "成交数量", "减持股数", "增持股数", "减持数量", "增持数量", "买入股份数", "卖出股份数", "股数" }.ToList();
        ChangeNumberRule.IsTitleEq = false;
        ChangeNumberRule.Normalize = NumberUtility.NormalizerStockNumber;


        var Rules = new List <TableSearchTitleRule>();

        Rules.Add(StockHolderRule);
        Rules.Add(ChangeDateRule);
        Rules.Add(ChangePriceRule);
        Rules.Add(ChangeNumberRule);

        var result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);

        if (result.Count == 0)
        {
            // Nothing was extracted: retry without the holder column, with the date
            // column marked as required instead.
            Rules.Clear();
            ChangeDateRule.IsRequire = true;
            Rules.Add(ChangeDateRule);
            Rules.Add(ChangePriceRule);
            Rules.Add(ChangeNumberRule);
            result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);
            if (result.Count == 0)
            {
                return(new List <RecordBase>());
            }
            var NewResult = new List <CellInfo[]>();
            var Name      = GetHolderName();
            if (String.IsNullOrEmpty(Name.FullName) && String.IsNullOrEmpty(Name.ShortName))
            {
                return(new List <RecordBase>());
            }
            // Rebuild each row in the four-column layout used below, with the holder
            // name (full name preferred, short name otherwise) as the first cell.
            foreach (var item in result)
            {
                NewResult.Add(new CellInfo[]
                              { new CellInfo()
                                {
                                    RawData = String.IsNullOrEmpty(Name.FullName)?Name.ShortName:Name.FullName
                                }, item[0], item[1], item[2] });
            }
            result = NewResult;
        }

        var holderafterlist = GetHolderAfter();
        // Holder names of the after-change list; the list is not modified inside the row
        // loop, so the names are collected once instead of once per row.
        var holderafternames = holderafterlist.Select((z) => { return(z.Name); }).ToList();

        var stockchangelist = new List <RecordBase>();

        foreach (var rec in result)
        {
            var stockchange = new StockChangeRec();
            stockchange.Id = Id;

            var ModifyName = rec[0].RawData;
            // A long name in a table may be cut off by a page break, so the name is
            // checked against the after-change (totals) list and completed from it.
            if (!holderafternames.Contains(ModifyName))
            {
                foreach (var item in holderafterlist)
                {
                    if (item.Name.EndsWith("先生"))
                    {
                        break;                            // special case kept from the original ("no logic to it")
                    }
                    if (item.Name.StartsWith(ModifyName) && !item.Name.Equals(ModifyName))
                    {
                        ModifyName = item.Name;
                        break;
                    }
                    if (item.Name.EndsWith(ModifyName) && !item.Name.Equals(ModifyName))
                    {
                        ModifyName = item.Name;
                        break;
                    }
                }
            }


            var Name = CompanyNameLogic.NormalizeCompanyName(this, ModifyName);
            stockchange.HolderFullName  = Name.FullName.NormalizeTextResult();
            stockchange.HolderShortName = Name.ShortName;

            // The full name still carries a "简称" (abbreviation) clause: derive the short
            // name from it and keep only the text before "(" as the full name.
            if (stockchange.HolderFullName.Contains("简称"))
            {
                stockchange.HolderShortName = Utility.GetStringAfter(stockchange.HolderFullName, "简称");
                stockchange.HolderShortName = stockchange.HolderShortName.Replace(")", String.Empty).Replace("“", String.Empty).Replace("”", String.Empty);
                stockchange.HolderFullName  = Utility.GetStringBefore(stockchange.HolderFullName, "(");
            }

            stockchange.ChangeEndDate = rec[1].RawData;

            if (!DateTime.TryParse(stockchange.ChangeEndDate, out _))
            {
                // Unparseable date: dropped outside debug mode, kept for inspection in debug mode.
                if (!Program.IsDebugMode)
                {
                    stockchange.ChangeEndDate = String.Empty;
                }
            }

            if (!String.IsNullOrEmpty(rec[2].RawData))
            {
                // Price ranges are not accepted as a price.
                if (!(rec[2].RawData.Contains("-") || rec[2].RawData.Contains("~") || rec[2].RawData.Contains("至")))
                {
                    stockchange.ChangePrice = rec[2].RawData.Replace(" ", String.Empty);
                    stockchange.ChangePrice = stockchange.ChangePrice.Replace("*", "");
                    stockchange.ChangePrice = stockchange.ChangePrice.NormalizeNumberResult();
                }
            }
            if (!RegularTool.IsUnsign(stockchange.ChangePrice))
            {
                if (!String.IsNullOrEmpty(stockchange.ChangePrice))
                {
                    Console.WriteLine("Error ChangePrice:[" + stockchange.ChangePrice + "]");
                }
                stockchange.ChangePrice = String.Empty;
            }


            if (!String.IsNullOrEmpty(rec[3].RawData))
            {
                stockchange.ChangeNumber = rec[3].RawData.Replace(" ", String.Empty);
                stockchange.ChangeNumber = stockchange.ChangeNumber.NormalizeNumberResult();
                if (!RegularTool.IsUnsign(stockchange.ChangeNumber))
                {
                    if (!String.IsNullOrEmpty(stockchange.ChangeNumber))
                    {
                        Console.WriteLine("Error ChangeNumber:[" + stockchange.ChangeNumber + "]");
                    }
                    stockchange.ChangeNumber = String.Empty;
                }
            }

            // Nearly every valid record has both a holder name and an end date. Dropping
            // rows without them may lose a few real records but helps the overall score.
            if (string.IsNullOrEmpty(stockchange.HolderFullName) || string.IsNullOrEmpty(stockchange.ChangeEndDate))
            {
                continue;
            }
            if (stockchange.ChangeNumber == "0" || stockchange.ChangePrice == "0")
            {
                continue;
            }
            stockchangelist.Add(stockchange);
        }


        // Collect every distinct holder full name.
        var namelist = stockchangelist.Select(x => ((StockChangeRec)x).HolderFullName).Distinct().ToList();
        var newRec   = new List <StockChangeRec>();

        foreach (var name in namelist)
        {
            var stocklist = stockchangelist.Where((x) => { return(((StockChangeRec)x).HolderFullName == name); }).ToList();
            // Order by end date so that Last() is the latest record of this holder.
            // The comparison must be x against y; comparing x with itself always yields 0
            // and leaves the order undefined.
            // NOTE(review): this is a string comparison, so it assumes the dates are in a
            // normalised, sortable format — confirm against NormailizeEndChangeDate.
            stocklist.Sort((x, y) => { return(((StockChangeRec)x).ChangeEndDate.CompareTo(((StockChangeRec)y).ChangeEndDate)); });
            var last = (StockChangeRec)stocklist.Last();
            for (int i = 0; i < holderafterlist.Count; i++)
            {
                var after = holderafterlist[i];
                after.Name = after.Name.Replace(" ", "");
                if (after.Name == last.HolderFullName || after.Name == last.HolderShortName)
                {
                    // Original author's note: the record is a struct and cannot be modified
                    // in place, so it is removed here and re-added through newRec below.
                    stockchangelist.Remove(last);
                    last.HoldNumberAfterChange  = after.Count;
                    last.HoldPercentAfterChange = after.Percent;
                    newRec.Add(last);
                }
            }
        }

        // The number of holders in the after-change list differs from the number found
        // in the records: flag it in the log for manual confirmation.
        if (holderafterlist.Count != namelist.Count)
        {
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("增持者数量确认!");
            }
        }

        stockchangelist.AddRange(newRec);
        return(stockchangelist);
    }
Пример #11
0
    /// <summary>
    /// Extracts the name of Party A (甲方, the awarding / purchasing side of the contract)
    /// from the current document.
    /// </summary>
    /// <remarks>
    /// Extraction runs in decreasing order of confidence:
    /// <list type="number">
    /// <item>Candidates that follow a leading "keyword:" label; the first one found wins.</item>
    /// <item>The result of <c>SearchJiaFang()</c> (called "NER" in this method), mapped from a
    /// company short name to its full name, and discarded when it equals <paramref name="YiFang"/>.</item>
    /// <item>Candidates captured between start/end marker phrases for bid-notice wording and
    /// for contract-signing wording.</item>
    /// </list>
    /// When a NER result exists it is returned, unless a marker-phrase candidate ends with it,
    /// in which case the longer candidate is returned. Without a NER result the decision is
    /// delegated to <c>CompanyNameLogic.MostLikeCompanyName</c>.
    /// </remarks>
    /// <param name="YiFang">
    /// Name of Party B (乙方) extracted elsewhere. May be null or empty, in which case the NER
    /// result is accepted unconditionally.
    /// </param>
    /// <returns>
    /// The chosen Party A name, or whatever <c>CompanyNameLogic.MostLikeCompanyName</c> yields
    /// for the collected candidates when no higher-confidence source produced a result.
    /// </returns>
    string GetJiaFang(String YiFang)
    {
        // Highest-confidence extraction: text that follows an explicit "label:" keyword.
        EntityProperty e = new EntityProperty();

        e.ExcludeContainsWordList = new string[] { "招标代理" };
        e.LeadingColonKeyWordList = new string[] {
            "甲方:", "合同买方:",
            "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "项目招标人:",
            "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
            "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
        };
        e.CandidatePreprocess = (x =>
        {
            x = Normalizer.ClearTrailing(x);
            return CompanyNameLogic.AfterProcessFullName(x).secFullName;
        });
        e.MaxLength = 32;
        e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
        e.MinLength = 3;
        e.Extract(this);

        // Candidates are deliberately not de-duplicated in the list itself: the more often a
        // candidate occurs, the more credible it is.
        // Several distinct candidates may actually mean that there is no Party A at all.
        if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
        {
            // The shared logger is only written to outside multi-thread mode, like every
            // other log statement in this method.
            foreach (var candidate in e.LeadingColonKeyWordCandidate)
            {
                Program.Logger.WriteLine("发现多个甲方:" + candidate);
            }
        }
        if (e.LeadingColonKeyWordCandidate.Count > 0)
        {
            return e.LeadingColonKeyWordCandidate[0];
        }

        var ner    = SearchJiaFang();
        var NerJia = String.Empty;

        if (!String.IsNullOrEmpty(ner))
        {
            // Map a company short name to the corresponding full name.
            foreach (var cn in companynamelist)
            {
                if (cn.secShortName == ner)
                {
                    ner = cn.secFullName;
                }
            }

            // Accept the NER result unless it is the same company as Party B.
            // The null/empty test must short-circuit before Equals is called: the original
            // code called YiFang.Equals(ner) unconditionally and threw on a null YiFang.
            if (String.IsNullOrEmpty(YiFang) || !YiFang.Equals(ner))
            {
                NerJia = ner;
            }
        }

        // Bid-notice wording: "<start marker> ... <end marker>".
        var Extractor     = new ExtractPropertyByHTML();
        var CandidateWord = new List<String>();
        var StartArray    = new string[] { "招标单位", "业主", "收到", "接到" };
        var EndArray      = new string[] { "发来", "发出", "的中标" };

        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;   // Business rule: a bidding agency is never Party A.
            }
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;   // Uses the untrimmed length so all-English names are not excluded.
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        // Contract-signing wording: "与 <name> 签署/签订".
        Extractor  = new ExtractPropertyByHTML();
        StartArray = new string[] { "与", "与业主" };
        EndArray   = new string[] { "签署", "签订" };
        Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
            // Note: here the "业主" prefix is stripped before the agency check, unlike the
            // bid-notice loop above; the order is kept as in the original.
            JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
            if (JiaFang.secFullName.Contains("招标代理"))
            {
                continue;   // Business rule: a bidding agency is never Party A.
            }
            if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
            {
                continue;
            }
            if (JiaFang.secFullName.Length < 3)
            {
                continue;   // Uses the untrimmed length so all-English names are not excluded.
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
            }
            CandidateWord.Add(JiaFang.secFullName);
        }

        if (String.IsNullOrEmpty(NerJia))
        {
            // No NER result: let the company-name logic pick among the collected candidates.
            return CompanyNameLogic.MostLikeCompanyName(CandidateWord);
        }

        // In principle the NER result is used when available. However, a marker-phrase
        // candidate may be the correct, more complete answer, e.g.
        // NER: "(集团)有限公司" while the real name is "XXXX(集团)有限公司".
        foreach (var c in CandidateWord)
        {
            if (c.EndsWith(NerJia))
            {
                return c;
            }
        }
        return NerJia;
    }