示例#1
0
    /// <summary>
    /// Scans an NER result file line by line and collects the entity strings formed by
    /// words tagged <c>B-Ni</c> (begin), <c>I-Ni</c> (inside) and <c>E-Ni</c> (end).
    /// </summary>
    /// <remarks>
    /// According to the original note, the file is several XML documents concatenated
    /// together (each with its own XML declaration and no common root), so it is not
    /// parsed as XML; only the prefix of each trimmed line is inspected.
    /// NOTE(review): "Ni" is presumably the organization-name tag of the tagger that
    /// produced the file - confirm against the tag set in use.
    /// </remarks>
    /// <param name="xmlfilename">Path of the NER result file.</param>
    /// <returns>
    /// One concatenated string for every <c>E-Ni</c> word encountered, in file order.
    /// An empty list when the file does not exist.
    /// </returns>
    public static List <String> AnlayzeNER(string xmlfilename)
    {
        var NerList = new List <String>();

        if (!File.Exists(xmlfilename))
        {
            return(NerList);
        }

        // Text of the entity currently being assembled.
        var ner = String.Empty;

        // "using" guarantees the reader is released even when parsing a line throws.
        using (var sr = new StreamReader(xmlfilename))
        {
            string rawLine;
            while ((rawLine = sr.ReadLine()) != null)
            {
                var line = rawLine.Trim();
                // Only <word .../> lines carry NER tags; sentence boundaries do not
                // influence the result, so they are not tracked here.
                if (!line.StartsWith("<word", StringComparison.Ordinal))
                {
                    continue;
                }

                var word = new struWordNER(line);
                switch (word.ne)
                {
                case "B-Ni":
                    // A new entity starts: discard whatever was accumulated before.
                    ner = word.cont;
                    break;

                case "I-Ni":
                    ner += word.cont;
                    break;

                case "E-Ni":
                    // Entity complete: emit it.
                    ner += word.cont;
                    NerList.Add(ner);
                    break;
                }
            }
        }
        return(NerList);
    }
示例#2
0
    /// <summary>
    /// Reads an NER result file and groups its words by sentence.
    /// </summary>
    /// <remarks>
    /// According to the original note, the file is several XML documents concatenated
    /// together (each with its own XML declaration and no common root), so it is not
    /// parsed as XML; only the prefix of each trimmed line is inspected.
    /// </remarks>
    /// <param name="xmlfilename">Path of the NER result file.</param>
    /// <returns>
    /// One word list per line starting with <c>&lt;sent</c>, in file order, each holding
    /// the <c>&lt;word</c> lines that follow it. An empty list when the file does not exist.
    /// </returns>
    public static List <List <struWordNER> > GetParagraghList(string xmlfilename)
    {
        var paragrapghList = new List <List <struWordNER> >();

        if (!File.Exists(xmlfilename))
        {
            return(paragrapghList);
        }

        // Words of the sentence currently being read; null until the first sentence starts.
        List <struWordNER> wordList = null;

        // "using" guarantees the reader is released even when parsing a line throws.
        using (var sr = new StreamReader(xmlfilename))
        {
            string rawLine;
            while ((rawLine = sr.ReadLine()) != null)
            {
                var line = rawLine.Trim();
                if (line.StartsWith("<sent", StringComparison.Ordinal))
                {
                    // A new sentence begins: flush the previous one first.
                    if (wordList != null)
                    {
                        paragrapghList.Add(wordList);
                    }
                    wordList = new List <struWordNER>();
                }
                if (line.StartsWith("<word", StringComparison.Ordinal))
                {
                    // A word that appears before any sentence marker opens an implicit
                    // sentence instead of dereferencing a null list.
                    if (wordList == null)
                    {
                        wordList = new List <struWordNER>();
                    }
                    wordList.Add(new struWordNER(line));
                }
            }
        }

        // Flush the last sentence, which has no following marker to trigger the flush.
        if (wordList != null)
        {
            paragrapghList.Add(wordList);
        }
        return(paragrapghList);
    }
示例#3
0
    /// <summary>
    /// Runs three company-name extraction rules over the given paragraphs via
    /// <c>NerExtract.Extract</c>.
    /// </summary>
    /// <remarks>
    /// Every rule starts at a word whose <c>pos</c> equals <c>LTPTrainingNER.地名</c>,
    /// allows at most 10 words, and differs only in the word sequence that ends the match.
    /// The extraction results are neither returned nor stored by this method.
    /// </remarks>
    /// <param name="paragragh">Sentences (word lists) to run the rules against.</param>
    public static void GetCompanyNameByNerInfo(List <List <struWordNER> > paragragh)
    {
        // Rule 1: ends with the single word "有限公司".
        var limitedCompanyRule = new NerExtract.NerExtractRule
        {
            MaxWordLength = 10,
            // Start word: only the type is specified.
            StartWord = new List <struWordNER>
            {
                new struWordNER { pos = LTPTrainingNER.地名 }
            },
            // End word: only the text is specified.
            EndWord = new List <struWordNER>
            {
                new struWordNER { cont = "有限公司" }
            }
        };

        // Rule 2: ends with the three-word sequence "有限" "责任" "公司".
        var limitedLiabilityRule = new NerExtract.NerExtractRule
        {
            MaxWordLength = 10,
            StartWord = new List <struWordNER>
            {
                new struWordNER { pos = LTPTrainingNER.地名 }
            },
            EndWord = new List <struWordNER>
            {
                new struWordNER { cont = "有限" },
                new struWordNER { cont = "责任" },
                new struWordNER { cont = "公司" }
            }
        };

        // Rule 3: ends with the four-word sequence "(" "有限" "合伙" ")".
        var limitedPartnershipRule = new NerExtract.NerExtractRule
        {
            MaxWordLength = 10,
            StartWord = new List <struWordNER>
            {
                new struWordNER { pos = LTPTrainingNER.地名 }
            },
            EndWord = new List <struWordNER>
            {
                new struWordNER { cont = "(" },
                new struWordNER { cont = "有限" },
                new struWordNER { cont = "合伙" },
                new struWordNER { cont = ")" }
            }
        };

        var company1 = NerExtract.Extract(limitedCompanyRule, paragragh);
        var company2 = NerExtract.Extract(limitedLiabilityRule, paragragh);
        var company3 = NerExtract.Extract(limitedPartnershipRule, paragragh);
    }
示例#4
0
    /// <summary>
    /// Extracts candidate project names from the NER result file of a document.
    /// </summary>
    /// <remarks>
    /// The file named by <c>doc.NerXMLFileName</c> is read line by line (it is not parsed
    /// as XML; the original note says it is several XML documents concatenated together)
    /// and grouped into sentences. Each sentence is then scanned: a candidate starts at a
    /// word tagged <c>B-Ns</c> / <c>S-Ns</c> (place name) or at the word "新建", grows
    /// word by word, and is closed when one of "项目", "工程", "标段" or "采购" is reached.
    /// Candidates containing "重大工程" or ";" or longer than 50 characters are dropped.
    /// Each accepted candidate is also written to the console.
    /// </remarks>
    /// <param name="doc">Document whose <c>NerXMLFileName</c> and <c>Id</c> are used.</param>
    /// <returns>
    /// The distinct accepted project names in order of first appearance; an empty list
    /// when the NER file does not exist.
    /// </returns>
    public static List <String> GetProjectNameByNer(AnnouceDocument doc)
    {
        var ProjList = new List <String>();

        if (!File.Exists(doc.NerXMLFileName))
        {
            return(ProjList);
        }

        // ---- Step 1: read the file into sentences (lists of words) ----
        var pl = new List <List <struWordNER> >();
        List <struWordNER> wl = null;

        // "using" guarantees the reader is released even when parsing a line throws.
        using (var sr = new StreamReader(doc.NerXMLFileName))
        {
            string rawLine;
            while ((rawLine = sr.ReadLine()) != null)
            {
                var line = rawLine.Trim();
                if (line.StartsWith("<sent", StringComparison.Ordinal))
                {
                    // A new sentence begins: flush the previous one first.
                    if (wl != null)
                    {
                        pl.Add(wl);
                    }
                    wl = new List <struWordNER>();
                }
                if (line.StartsWith("<word", StringComparison.Ordinal))
                {
                    // A word that appears before any sentence marker opens an implicit
                    // sentence instead of dereferencing a null list.
                    if (wl == null)
                    {
                        wl = new List <struWordNER>();
                    }
                    wl.Add(new struWordNER(line));
                }
            }
        }
        if (wl != null)
        {
            pl.Add(wl);
        }

        // ---- Step 2: scan every sentence for project names ----
        // Candidate being assembled. It is declared outside the sentence loop, so an
        // unfinished candidate carries over into the next sentence (original behavior).
        var proj = String.Empty;

        foreach (var p in pl)
        {
            for (int ScanIdx = 0; ScanIdx < p.Count; ScanIdx++)
            {
                var word = p[ScanIdx];
                if (word.ne == "B-Ns" || word.ne == "S-Ns" ||
                    word.cont == "新建")
                {
                    // Start of a place name, a single-word place name, or "新建":
                    // append when the candidate already starts with "新建",
                    // otherwise restart the candidate at this word.
                    if (!string.IsNullOrEmpty(proj) && proj.StartsWith("新建", StringComparison.Ordinal))
                    {
                        proj += word.cont;
                    }
                    else
                    {
                        proj = word.cont;
                    }
                    continue;
                }

                var isTerminator = word.cont == "项目" ||
                                   word.cont == "工程" ||
                                   word.cont == "标段" ||
                                   word.cont == "采购";
                if (!isTerminator)
                {
                    // Ordinary word: only extends a candidate that has already started.
                    if (!String.IsNullOrEmpty(proj))
                    {
                        proj += word.cont;
                    }
                    continue;
                }

                if (String.IsNullOrEmpty(proj))
                {
                    // Terminator without a started candidate: nothing to close.
                    continue;
                }

                proj += word.cont;

                // Exclusive upper bound of the look-ahead window:
                // at most the four words following the terminator.
                var FurtherTo = Math.Min(p.Count, ScanIdx + 5);
                var ShardProj = proj;

                // Look-ahead after "标段": extend the name up to and including a
                // following "项目", "工程" or "承包" inside the window.
                if (word.cont == "标段")
                {
                    for (int TrailingScanIdx = ScanIdx + 1;
                         TrailingScanIdx < FurtherTo;
                         TrailingScanIdx++)
                    {
                        ShardProj += p[TrailingScanIdx].cont;
                        if (p[TrailingScanIdx].cont == "项目" ||
                            p[TrailingScanIdx].cont == "工程" ||
                            p[TrailingScanIdx].cont == "承包")
                        {
                            proj = ShardProj;
                            break;
                        }
                    }
                }

                // Look-ahead after "工程" / "项目": extend the name up to and including
                // a following "标段" inside the window and move the scan position there.
                if (word.cont == "工程" || word.cont == "项目")
                {
                    // True while an opening "(" has been seen without its closing ")".
                    var isContranBrack = false;
                    for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo;
                         TrailingScanIdx++)
                    {
                        ShardProj += p[TrailingScanIdx].cont;
                        if (p[TrailingScanIdx].cont.Trim() == "(")
                        {
                            isContranBrack = true;
                        }
                        if (p[TrailingScanIdx].cont.Trim() == ")")
                        {
                            isContranBrack = false;
                        }
                        if (p[TrailingScanIdx].cont == "标段")
                        {
                            ScanIdx = TrailingScanIdx;
                            if (isContranBrack)
                            {
                                // "标段" sits inside an open bracket: close it in the
                                // name and skip one more word.
                                // NOTE(review): this assumes the skipped word is the
                                // closing ")" - the code does not verify it.
                                ShardProj += ")";
                                ScanIdx++;
                            }
                            proj = ShardProj;
                            break;
                        }
                    }
                }

                // Final check: if the very next word is again "工程", "项目", "标段"
                // or "活动", it belongs to the name as well.
                if (ScanIdx + 1 <= p.Count - 1)
                {
                    if (p[ScanIdx + 1].cont == "工程" || p[ScanIdx + 1].cont == "项目" ||
                        p[ScanIdx + 1].cont == "标段" || p[ScanIdx + 1].cont == "活动")
                    {
                        proj += p[ScanIdx + 1].cont;
                        ScanIdx++;
                    }
                }

                // Plausibility filter.
                // NOTE(review): the previous code additionally rejected candidates when
                // proj.Contains("") was true. String.Contains with an empty argument is
                // always true, so every candidate was rejected and the method could only
                // return an empty list. That check has been removed; if a specific
                // character was meant to be filtered, add it back explicitly.
                var isOK = !proj.Contains("重大工程") &&
                           proj.Length <= 50 &&
                           !proj.Contains(";");
                if (isOK)
                {
                    Console.WriteLine(doc.Id + " NER 发现工程:" + proj);
                    ProjList.Add(proj);
                }
                proj = string.Empty;
            }
        }
        return(ProjList.Distinct().ToList());
    }