Beispiel #1
0
    /// <summary>
    /// Collects candidate values for this property from <paramref name="doc"/>, running every
    /// extraction strategy that is configured on this instance.
    /// </summary>
    /// <remarks>
    /// If <c>KeyWordMap</c> is non-empty, only the keyword-map strategy runs and the method returns
    /// immediately afterwards. Otherwise each of the following strategies runs when its
    /// configuration member is non-null, and appends to its own candidate list:
    /// leading-colon keywords, quotation trailing words, dependency-parse keywords and
    /// external start/end string features.
    /// </remarks>
    /// <param name="doc">Document supplying the HTML root, the text file name, the quotation list
    /// and the dependency-parse list that the strategies read.</param>
    public void Extract(AnnouceDocument doc)
    {
        // Pure keyword type: when a keyword map is configured nothing else is attempted.
        if (KeyWordMap.Count != 0)
        {
            var candidate = ExtractByKeyWordMap(doc.root);
            if (candidate.Count == 1)
            {
                WordMapResult = candidate.First();
            }
            if (candidate.Count > 1)
            {
                // Ambiguous result: more than one keyword matched, so no result is stored.
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine("找到纯关键字类型两个关键字");
                }
            }
            return;
        }

        if (LeadingColonKeyWordList != null)
        {
            // Rule: values introduced by a fixed leading word followed by a colon, e.g. "[项目名:]".
            // These candidates are not passed through CheckCandidate (e.g. no max/min length check);
            // they only go through their dedicated preprocessor.
            var ExtractorText = new ExtractPropertyByText();
            // The TEXT version may contain extra spaces, so the HTML version is scanned as well (below).
            ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
            ExtractorText.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorText.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }

            var Extractor = new ExtractPropertyByHTML();
            Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
            Extractor.Extract(doc.root);
            // Fix: iterate the HTML extractor's candidates. The previous code looped over
            // ExtractorText.CandidateWord again, so the HTML results were never used.
            foreach (var item in Extractor.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                // Values already found in the TEXT pass are not added a second time.
                if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
                {
                    LeadingColonKeyWordCandidate.Add(PropertyValue);
                }
            }
        }

        // Book-title marks and quotation marks.
        if (QuotationTrailingWordList != null)
        {
            // Quoted spans whose content ends with one of the configured trailing words.
            foreach (var bracket in doc.quotationList)
            {
                foreach (var word in QuotationTrailingWordList)
                {
                    if (bracket.Value.EndsWith(word))
                    {
                        var PropertyValue = CheckCandidate(bracket.Value);
                        if (String.IsNullOrEmpty(PropertyValue))
                        {
                            continue;
                        }
                        if (!Program.IsMultiThreadMode)
                        {
                            Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                        }
                        QuotationTrailingCandidate.Add(PropertyValue);
                    }
                }
            }
        }

        // Dependency parsing.
        if (DpKeyWordList != null)
        {
            var ExtractDP = new ExtractPropertyByDP();
            ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
            foreach (var item in ExtractDP.CandidateWord)
            {
                var PropertyValue = CheckCandidate(item.Value);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                DpKeyWordCandidate.Add(PropertyValue);
            }
        }

        if (ExternalStartEndStringFeature != null)
        {
            // First pass: the text file. Candidates go through the optional preprocessor
            // and then through CheckCandidate.
            var ExtractorTEXT = new ExtractPropertyByText();
            ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorTEXT.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }

            // Second pass: the HTML tree, for cases where the TEXT pass cannot extract a value.
            var ExtractorHTML = new ExtractPropertyByHTML();
            ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorHTML.Extract(doc.root);
            foreach (var item in ExtractorHTML.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                // Skip values the TEXT pass already contributed.
                if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
                {
                    ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
                }
            }
        }
    }
Beispiel #2
0
    /// <summary>
    /// Gets the project name by trying several extraction strategies in priority order and
    /// returning the first candidate that passes that strategy's filters.
    /// </summary>
    /// <remarks>
    /// Order of strategies: leading-colon keywords in the text file, the same keywords in the
    /// HTML tree, quoted spans ending in "工程" or "标段", mark features in the HTML tree, and
    /// finally dependency-parse keywords.
    /// </remarks>
    /// <returns>The first accepted candidate, or <see cref="String.Empty"/> when no strategy
    /// yields one.</returns>
    string GetProjectName()
    {
        var ExtractorText = new ExtractPropertyByText();

        // Text following these keywords has the highest priority.
        ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
        ExtractorText.ExtractFromTextFile(TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            // NOTE(review): the first three loops limit by MaxContractNameLength while the
            // dependency-parse loop uses MaxProjectNameLength; confirm which limit is intended.
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            // TrimJianCheng is evaluated once and its result reused (previously called twice).
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return ProjectName;
        }

        var Extractor = new ExtractPropertyByHTML();

        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        // Fix: run the HTML extraction before reading its candidates. Without this call the
        // loop below iterated the candidates of an extractor that had never been executed.
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            var Trimmed = TrimJianCheng(ProjectName);
            if (Trimmed == String.Empty)
            {
                continue;
            }
            ProjectName = Trimmed;
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
            }
            return ProjectName;
        }

        // Quoted spans ending with "工程" or "标段" are returned as-is, without length checks.
        foreach (var bracket in quotationList)
        {
            if (bracket.Value.EndsWith("工程") ||
                bracket.Value.EndsWith("标段"))
            {
                return bracket.Value;
            }
        }

        // Mark features: text enclosed in “ ” whose inner content ends with "标段" or "标".
        var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeature.MarkStartWith = "“";
        MarkFeature.MarkEndWith   = "”";
        MarkFeature.InnerEndWith  = "标段";

        var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();

        MarkFeatureConfirm.MarkStartWith = "“";
        MarkFeatureConfirm.MarkEndWith   = "”";
        MarkFeatureConfirm.InnerEndWith  = "标";

        // The same extractor instance is reused, so LeadingColonKeyWordList is still set on it
        // for this second Extract call (unchanged from the original behavior).
        Extractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
        Extractor.Extract(root);
        foreach (var item in Extractor.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                // Fix: log the candidate string, as the other loops do, rather than the
                // candidate object itself.
                Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + ProjectName + "]");
            }
            return ProjectName;
        }

        // Last resort: dependency parsing between the start words and end words below.
        var ExtractDP = new ExtractPropertyByDP();
        var KeyList   = new List <ExtractPropertyByDP.DPKeyWord>();

        KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
        {
            StartWord    = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
            StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
            EndWord      = new string[] { "采购", "项目", "工程", "标段" },
            EndDPValue   = new string[] { }
        });
        ExtractDP.StartWithKey(KeyList, Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var ProjectName = item.Value.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
            {
                continue;
            }
            // Candidates of four characters or fewer are rejected.
            if (ProjectName.Length <= 4)
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
            }
            return ProjectName;
        }

        return String.Empty;
    }