示例#1
0
    /// <summary>
    /// Searches the document text for an abbreviation introduced right after a full name.
    /// The pattern looked for is: the full name, followed in the same location by the
    /// marker word "简称", followed within 4 index positions by a quoted string.
    /// </summary>
    /// <param name="FullName">Full name to look up; names of 4 characters or fewer are not searched.</param>
    /// <param name="doc">Document whose root node is scanned.</param>
    /// <returns>
    /// The first quoted value matching the pattern that is shorter than
    /// <paramref name="FullName"/>, or an empty string when nothing matches.
    /// </returns>
    public static string GetShortNameByFullName(String FullName, AnnouceDocument doc)
    {
        // A name this short is treated as already being an abbreviation.
        if (FullName.Length <= 4)
        {
            return string.Empty;
        }

        var quotes      = LocateProperty.LocateQuotation(doc.root, false);
        var nameHits    = LocateProperty.LocateCustomerWord(doc.root, new[] { FullName }.ToList());
        var markerHits  = LocateProperty.LocateCustomerWord(doc.root, new[] { "简称" }.ToList());

        foreach (var nameHit in nameHits)
        {
            // Quotations located in the same place as the full name and starting after it.
            var quotesAfterName = quotes.Where(
                q => q.Loc == nameHit.Loc && q.Description == "引号" && q.StartIdx > nameHit.StartIdx);

            foreach (var quote in quotesAfterName)
            {
                foreach (var marker in markerHits)
                {
                    // The marker must sit between the full name and the quotation...
                    if (marker.Loc != nameHit.Loc)
                    {
                        continue;
                    }
                    if (marker.StartIdx <= nameHit.StartIdx || marker.StartIdx >= quote.StartIdx)
                    {
                        continue;
                    }
                    // ...and the quotation must begin at most 4 positions after the marker.
                    if (quote.StartIdx - marker.StartIdx > 4)
                    {
                        continue;
                    }
                    // Only accept a candidate that is really shorter than the full name.
                    if (quote.Value.Length < FullName.Length)
                    {
                        return quote.Value;
                    }
                }
            }
        }

        return string.Empty;
    }
示例#2
0
文件: NerMap.cs 项目: lxxwin/FDDC
    /// <summary>
    /// Entity analysis: locates the document's named entities (persons, organisations,
    /// places) and records, per child node of each paragraph, the aggregated entity,
    /// money and date hits in <c>ParagraghlocateDict</c>.
    /// </summary>
    /// <param name="doc">Document to analyse.</param>
    public void Anlayze(AnnouceDocument doc)
    {
        var locatedEntities = new List <LocAndValue <String> >();

        if (doc.Nerlist != null)
        {
            // Person names
            var personNames = doc.Nerlist.Where(n => n.Type == enmNerType.Nh)
                              .Select(n => n.RawData)
                              .Distinct()
                              .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, personNames, "人名"));

            // Organisations
            var orgNames = doc.Nerlist.Where(n => n.Type == enmNerType.Ni)
                           .Select(n => n.RawData)
                           .Distinct()
                           .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, orgNames, "机构"));

            // Place names
            var placeNames = doc.Nerlist.Where(n => n.Type == enmNerType.Ns)
                             .Select(n => n.RawData)
                             .Distinct()
                             .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, placeNames, "地名"));
        }

        foreach (var paragraph in doc.root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var info     = LocateParagraphInfo(doc, sentence.PositionId, locatedEntities);
                var hitTotal = info.NerList.Count + info.moneylist.Count + info.datelist.Count;

                // Nodes without any hit are not recorded.
                if (hitTotal == 0)
                {
                    continue;
                }
                // Keep the first entry registered for a position id.
                if (ParagraghlocateDict.ContainsKey(sentence.PositionId))
                {
                    continue;
                }
                ParagraghlocateDict.Add(sentence.PositionId, info);
            }
        }
    }
示例#3
0
    /// <summary>
    /// Merges table <paramref name="NextTableId"/> into the table that precedes it
    /// (<c>NextTableId - 1</c>). Records have the form <c>"tableId,row,col|value"</c>;
    /// each record of the second table is re-addressed to the first table with its row
    /// number shifted by the first table's row count.
    /// </summary>
    /// <param name="doc">Document that owns the table lists and the node tree.</param>
    /// <param name="NextTableId">Id of the table to append to its predecessor.</param>
    public static void MergeTable(AnnouceDocument doc, int NextTableId)
    {
        var targetId  = NextTableId - 1;
        var rowOffset = new HTMLTable(doc.root.TableList[targetId]).RowCount;

        // Re-address every record of the second table and append it to the first one.
        foreach (var record in doc.root.TableList[NextTableId])
        {
            var parts      = record.Split("|");
            var coords     = parts[0].Split(",");
            var cellValue  = parts[1];
            var shiftedRow = rowOffset + int.Parse(coords[1]);
            doc.root.TableList[targetId].Add($"{targetId},{shiftedRow},{coords[2]}|{cellValue}");
        }

        // The second table is now empty; detach every node that still refers to it.
        doc.root.TableList[NextTableId].Clear();
        foreach (var paragraph in doc.root.Children)
        {
            foreach (var node in paragraph.Children)
            {
                if (node.TableId == NextTableId)
                {
                    node.TableId = -1;
                }
            }
        }
    }
示例#4
0
    /// <summary>
    /// Fills in null placeholder cells of every table, in two passes over records of the
    /// form <c>"tableId,row,col|value"</c>:
    /// 1. When a cell holds a known company full or short name, every null cell in the
    ///    same column of that table is overwritten with that name.
    /// 2. A remaining null cell (not in row 1) whose neighbour directly above or below in
    ///    the same column is a row-span cell is itself marked as a row-span cell.
    /// </summary>
    /// <param name="root">Root node owning the table lists (ids are iterated from 1 to Count).</param>
    /// <param name="doc">Document supplying the known company names.</param>
    public static void FixNullValue(MyRootHtmlNode root, AnnouceDocument doc)
    {
        var knownFullNames  = doc.companynamelist.Select(c => c.secFullName).Distinct().ToList();
        var knownShortNames = doc.companynamelist.Select(c => c.secShortName).Distinct().ToList();

        // Pass 1: propagate company names into null cells of the same column.
        for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
        {
            var cells = root.TableList[tableId];
            for (int sourceIdx = 0; sourceIdx < cells.Count; sourceIdx++)
            {
                var sourceParts = cells[sourceIdx].Split("|");
                var sourcePos   = sourceParts[0].Split(",");
                var candidate   = sourceParts[1].Replace(" ", "");
                var sourceCol   = int.Parse(sourcePos[2]);

                if (!knownFullNames.Contains(candidate) && !knownShortNames.Contains(candidate))
                {
                    continue;
                }

                for (int targetIdx = 0; targetIdx < cells.Count; targetIdx++)
                {
                    var targetParts = cells[targetIdx].Split("|");
                    var targetValue = targetParts[1];
                    var targetCol   = int.Parse(targetParts[0].Split(",")[2]);
                    if (targetValue.Equals(strNullValue) && sourceCol == targetCol)
                    {
                        cells[targetIdx] = targetParts[0] + "|" + candidate;
                    }
                }
            }
        }

        // Pass 2: a null cell adjacent (above or below) to a row-span cell is a row-span cell too.
        for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
        {
            var cells = root.TableList[tableId];
            for (int cellIdx = 0; cellIdx < cells.Count; cellIdx++)
            {
                var parts = cells[cellIdx].Split("|");
                var pos   = parts[0].Split(",");
                var value = parts[1].Replace(" ", "");
                var row   = int.Parse(pos[1]);
                var col   = int.Parse(pos[2]);

                if (value != strNullValue || row == 1)
                {
                    continue;
                }

                var spanAbove = $"{tableId},{row - 1},{col}|{strRowSpanValue}";
                var spanBelow = $"{tableId},{row + 1},{col}|{strRowSpanValue}";
                if (cells.Contains(spanAbove) || cells.Contains(spanBelow))
                {
                    cells[cellIdx] = parts[0] + "|" + strRowSpanValue;
                }
            }
        }
    }
示例#5
0
    /// <summary>
    /// Resolves a company name found in the text to a (full name, short name) pair.
    /// Resolution order: the company names extracted for the document, then the
    /// company-name dictionary, and finally a search of the document text for a
    /// "简称" marker followed by a quoted abbreviation.
    /// </summary>
    /// <param name="doc">Document providing <c>companynamelist</c> and the text to search.</param>
    /// <param name="word">Company name as found; may be a full name or an abbreviation.</param>
    /// <returns>
    /// The resolved full and short names. Both are empty when <paramref name="word"/> is
    /// null, empty, or consists only of spaces; the short name is empty when none was found.
    /// </returns>
    public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
    {
        if (String.IsNullOrEmpty(word))
        {
            return(String.Empty, String.Empty);
        }
        var fullname  = word.Replace(" ", String.Empty);
        var shortname = String.Empty;

        // A word made only of spaces would otherwise be "contained" in every full name
        // below and get resolved to an arbitrary company.
        if (fullname.Length == 0)
        {
            return(String.Empty, String.Empty);
        }

        foreach (var companyname in doc.companynamelist)
        {
            if (companyname.secFullName == fullname)
            {
                // Note: two entries may share the same full name while only one of them
                // carries a short name, so keep looking when this one has none.
                if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName))
                {
                    shortname = companyname.secShortName;
                    break;
                }
            }
            if (companyname.secShortName == fullname)
            {
                fullname  = companyname.secFullName;
                shortname = companyname.secShortName;
                break;
            }
            // The incoming word may be an abbreviation while the extracted company info
            // only has the full name: infer the relation when the full name strictly
            // contains the word. Entries without a full name are skipped (previously a
            // null full name caused a NullReferenceException here).
            if (!String.IsNullOrEmpty(companyname.secFullName) &&
                companyname.secFullName.Contains(fullname) &&
                companyname.secFullName.Length > fullname.Length)
            {
                fullname  = companyname.secFullName;
                shortname = word;
            }
        }

        if (string.IsNullOrEmpty(shortname))
        {
            // Fall back to the company-name dictionary.
            shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
        }

        if (string.IsNullOrEmpty(shortname))
        {
            // Last resort: find the name in the original text and check whether it is
            // followed by the "简称" marker and a quoted string; if so, that quoted
            // string is most likely the abbreviation.
            shortname = GetShortNameByFullName(fullname, doc);
            if (!string.IsNullOrEmpty(shortname))
            {
                Console.WriteLine(fullname + ":" + shortname);
            }
        }

        return(fullname, shortname);
    }
示例#6
0
 /// <summary>
 /// Single-row merge: whenever a table has exactly one row and the same column count
 /// as the table that follows it, the following table is merged into it.
 /// </summary>
 /// <param name="doc">Document whose tables are inspected (table ids start at 1).</param>
 private static void OneRowFix(AnnouceDocument doc)
 {
     int NextTableId = 2;
     while (NextTableId <= doc.root.TableList.Count)
     {
         var previous = new HTMLTable(doc.root.TableList[NextTableId - 1]);
         var current  = new HTMLTable(doc.root.TableList[NextTableId]);

         bool previousIsSingleRow = previous.RowCount == 1;
         if (previousIsSingleRow && previous.ColumnCount == current.ColumnCount)
         {
             MergeTable(doc, NextTableId);
         }
         NextTableId++;
     }
 }
示例#7
0
    /// <summary>
    /// Walks every HTML file of the contract training set and, for each file that has a
    /// labelled contract, gathers leading-word statistics for the four labelled fields
    /// (JiaFang, YiFang, ProjectName, ContractName). The top 5 leading words per field
    /// are stored in the corresponding <c>*LeadingDict</c> members.
    /// Surround-word statistics are currently disabled; only leading words are collected.
    /// </summary>
    public static void AnlayzeEntitySurroundWords()
    {
        var ContractPath_TRAIN      = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\重大合同";
        var JiaFangNameLeadingWord  = new LeadingWord();
        var YiFangNameLeadingWord   = new LeadingWord();
        var ProjectNameLeadingWord  = new LeadingWord();
        var ContractNameLeadingWord = new LeadingWord();

        foreach (var filename in System.IO.Directory.GetFiles(ContractPath_TRAIN + @"\html\"))
        {
            // The file name without ".html" is the id used by the training data set.
            var Id = new System.IO.FileInfo(filename).Name.Replace(".html", String.Empty);

            // Look the labelled contracts up once instead of once for the count and once for the item.
            var contracts = TraningDataset.GetContractById(Id);
            if (contracts.Count == 0)
            {
                continue;
            }
            var contract = contracts.First();
            var doc      = new AnnouceDocument(filename);

            if (!string.IsNullOrEmpty(contract.JiaFang))
            {
                JiaFangNameLeadingWord.AnlayzeLeadingWord(doc, contract.JiaFang);
            }
            if (!string.IsNullOrEmpty(contract.YiFang))
            {
                YiFangNameLeadingWord.AnlayzeLeadingWord(doc, contract.YiFang);
            }
            if (!string.IsNullOrEmpty(contract.ProjectName))
            {
                ProjectNameLeadingWord.AnlayzeLeadingWord(doc, contract.ProjectName);
            }
            if (!string.IsNullOrEmpty(contract.ContractName))
            {
                ContractNameLeadingWord.AnlayzeLeadingWord(doc, contract.ContractName);
            }
        }

        JiaFangLeadingDict      = JiaFangNameLeadingWord.GetTop(5);
        YiFangLeadingDict       = YiFangNameLeadingWord.GetTop(5);
        ProjectNameLeadingDict  = ProjectNameLeadingWord.GetTop(5);
        ContractNameLeadingDict = ContractNameLeadingWord.GetTop(5);
    }
示例#8
0
    /// <summary>
    /// Collects leading words of the form "XXX:value": for every line of the document's
    /// text file whose text after the first ':' equals <paramref name="searchKey"/>
    /// (both sides compared after <c>NormalizeTextResult</c>), the text before the ':'
    /// is counted in <c>LeadingWordDict</c>.
    /// </summary>
    /// <param name="doc">Document whose <c>TextFileName</c> is read; nothing happens if the file does not exist.</param>
    /// <param name="searchKey">Labelled value to look for after the ':'.</param>
    public void AnlayzeLeadingWord(AnnouceDocument doc, String searchKey)
    {
        if (!File.Exists(doc.TextFileName))
        {
            return;
        }

        // Dispose the reader deterministically so the file handle is released even when
        // an exception is thrown while processing a line.
        using (var SR = new StreamReader(doc.TextFileName))
        {
            while (!SR.EndOfStream)
            {
                var line = SR.ReadLine();
                var idx  = line.IndexOf(":");
                if (idx == -1)
                {
                    continue;
                }

                var rawLeading = line.Substring(0, idx);
                var keyword    = line.Substring(idx + 1).Trim();
                if (!keyword.NormalizeTextResult().Equals(searchKey.NormalizeTextResult()))
                {
                    continue;
                }

                // Strip numbering prefixes such as "(一)合同名称" or "2、备查文件": whenever a
                // punctuation or numeral token is met, everything accumulated so far is
                // dropped, so only the text after the last such token remains.
                var leading = "";
                foreach (var word in pos.Cut(rawLeading))
                {
                    if (word.Flag == LTPTrainingNER.词性标点 || word.Flag == LTPTrainingNER.数词)
                    {
                        leading = "";
                    }
                    else
                    {
                        leading += word.Word;
                    }
                }
                leading = leading.Trim();
                if (String.IsNullOrEmpty(leading))
                {
                    continue;
                }

                if (LeadingWordDict.ContainsKey(leading))
                {
                    LeadingWordDict[leading] = LeadingWordDict[leading] + 1;
                }
                else
                {
                    LeadingWordDict.Add(leading, 1);
                }
            }
        }
    }
示例#9
0
    /// <summary>
    /// First-row NULL merge: when a table's first row contains a null placeholder cell,
    /// the table has the same column count as the preceding table, and the two tables
    /// are at most 200 positions apart in the document, the table is merged into its
    /// predecessor via <c>MergeTable</c>.
    /// </summary>
    /// <param name="doc">Document whose tables are inspected (table ids start at 1).</param>
    private static void FirstRowNullFix(AnnouceDocument doc)
    {
        for (int NextTableId = 2; NextTableId <= doc.root.TableList.Count; NextTableId++)
        {
            // Locate both tables in the node tree (the last matching node wins). This does
            // not depend on the individual records, so it is done once per table pair
            // rather than once per record.
            var FirstTablePos  = -1;
            var SecondTablePos = -1;
            foreach (var p in doc.root.Children)
            {
                foreach (var s in p.Children)
                {
                    if (s.TableId == NextTableId - 1)
                    {
                        FirstTablePos = s.PositionId;
                    }
                    if (s.TableId == NextTableId)
                    {
                        SecondTablePos = s.PositionId;
                    }
                }
            }

            // Tables that are far apart are not considered parts of one split table.
            if (SecondTablePos - FirstTablePos > 200)
            {
                continue;
            }

            foreach (var item in doc.root.TableList[NextTableId])
            {
                var tablerec = item.Split("|");
                var pos      = tablerec[0].Split(",");
                var value    = tablerec[1];
                var row      = int.Parse(pos[1]);
                // The second table has a null cell in its first row.
                if (row == 1 && value == strNullValue)
                {
                    var table     = new HTMLTable(doc.root.TableList[NextTableId - 1]);
                    var nexttable = new HTMLTable(doc.root.TableList[NextTableId]);
                    if (table.ColumnCount != nexttable.ColumnCount)
                    {
                        continue;
                    }
                    MergeTable(doc, NextTableId);
                    Console.WriteLine("FirstRowNullFix");
                    // The merge is expected to modify the list being enumerated, so stop
                    // iterating over it immediately.
                    break;
                }
            }
        }
    }
示例#10
0
    /// <summary>
    /// Resolves a company name found in the text to a (full name, short name) pair,
    /// first against the company names extracted for the document and then against the
    /// company-name dictionary.
    /// </summary>
    /// <param name="doc">Document providing <c>companynamelist</c>.</param>
    /// <param name="word">Company name as found; may be a full name or an abbreviation.</param>
    /// <returns>
    /// The resolved full and short names. Both are empty when <paramref name="word"/> is
    /// null, empty, or consists only of spaces.
    /// </returns>
    public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
    {
        if (String.IsNullOrEmpty(word))
        {
            return(String.Empty, String.Empty);
        }
        var fullname  = word.Replace(" ", String.Empty);
        var shortname = String.Empty;

        // A word made only of spaces would otherwise be "contained" in every full name
        // below and get resolved to an arbitrary company.
        if (fullname.Length == 0)
        {
            return(String.Empty, String.Empty);
        }

        foreach (var companyname in doc.companynamelist)
        {
            if (companyname.secFullName == fullname)
            {
                // Note: two entries may share the same full name while only one of them
                // carries a short name, so keep looking when this one has none.
                if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName))
                {
                    shortname = companyname.secShortName;
                    break;
                }
            }
            if (companyname.secShortName == fullname)
            {
                fullname  = companyname.secFullName;
                shortname = companyname.secShortName;
                break;
            }
            // The incoming word may be an abbreviation while the extracted company info
            // only has the full name: infer the relation when the full name strictly
            // contains the word. Entries without a full name are skipped (previously a
            // null full name caused a NullReferenceException here).
            if (!String.IsNullOrEmpty(companyname.secFullName) &&
                companyname.secFullName.Contains(fullname) &&
                companyname.secFullName.Length > fullname.Length)
            {
                fullname  = companyname.secFullName;
                shortname = word;
            }
        }

        if (shortname == String.Empty)
        {
            // Fall back to the company-name dictionary.
            shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
        }
        return(fullname, shortname);
    }
示例#11
0
    /// <summary>
    /// Entity analysis: rebuilds <c>ParagraghlocateDict</c> from scratch. Named entities
    /// (organisations, places, persons) and the document's company full names are located
    /// in the text, then the aggregated hits of every child node of each paragraph are
    /// stored under that node's position id.
    /// </summary>
    /// <param name="doc">Document to analyse.</param>
    public void Anlayze(AnnouceDocument doc)
    {
        ParagraghlocateDict.Clear();
        var locatedEntities = new List <LocAndValue <String> >();

        if (doc.Nerlist != null)
        {
            // Organisations
            var orgNames = doc.Nerlist.Where(n => n.Type == enmNerType.Ni)
                           .Select(n => n.RawData)
                           .Distinct()
                           .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, orgNames, "机构"));

            // Place names
            var placeNames = doc.Nerlist.Where(n => n.Type == enmNerType.Ns)
                             .Select(n => n.RawData)
                             .Distinct()
                             .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, placeNames, "地名"));

            // Person names
            var personNames = doc.Nerlist.Where(n => n.Type == enmNerType.Nh)
                              .Select(n => n.RawData)
                              .Distinct()
                              .ToList();
            locatedEntities.AddRange(LocateCustomerWord(doc.root, personNames, "人名"));
        }

        // Supplement with the company full names known for this document
        // (entries without a full name are ignored).
        var companyFullNames = doc.companynamelist
                               .Select(c => c.secFullName)
                               .Where(name => !String.IsNullOrEmpty(name))
                               .Distinct()
                               .ToList();
        locatedEntities.AddRange(LocateCustomerWord(doc.root, companyFullNames, "公司名"));

        foreach (var paragraph in doc.root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var info     = LocateParagraphInfo(doc, sentence.PositionId, locatedEntities);
                var hitTotal = info.NerList.Count
                               + info.moneylist.Count
                               + info.datelist.Count
                               + info.percentList.Count
                               + info.socketNumberList.Count;

                // Nodes without any hit are not recorded.
                if (hitTotal == 0)
                {
                    continue;
                }
                // Keep the first entry registered for a position id.
                if (ParagraghlocateDict.ContainsKey(sentence.PositionId))
                {
                    continue;
                }
                ParagraghlocateDict.Add(sentence.PositionId, info);
            }
        }
    }
示例#12
0
文件: NerMap.cs 项目: lxxwin/FDDC
    /// <summary>
    /// Aggregates the entities found at one position of the document: the document's
    /// dates and money amounts go to their own lists, while quotations and the supplied
    /// named entities are both collected in <c>NerList</c>.
    /// </summary>
    /// <param name="doc">Document providing the date, money and quotation lists.</param>
    /// <param name="PosId">Position id to collect entries for (matched against each item's <c>Loc</c>).</param>
    /// <param name="nerList">Already located named entities to filter by position.</param>
    /// <returns>A freshly initialised <c>ParagraghLoc</c> holding the matching items.</returns>
    ParagraghLoc LocateParagraphInfo(AnnouceDocument doc, int PosId, List <LocAndValue <String> > nerList)
    {
        var result = new ParagraghLoc();
        result.Init();

        foreach (var date in doc.datelist)
        {
            if (date.Loc != PosId)
            {
                continue;
            }
            result.datelist.Add(date);
        }

        foreach (var money in doc.moneylist)
        {
            if (money.Loc != PosId)
            {
                continue;
            }
            result.moneylist.Add(money);
        }

        // Quotations are stored together with the named entities.
        foreach (var quotation in doc.quotationList)
        {
            if (quotation.Loc != PosId)
            {
                continue;
            }
            result.NerList.Add(quotation);
        }

        foreach (var entity in nerList)
        {
            if (entity.Loc != PosId)
            {
                continue;
            }
            result.NerList.Add(entity);
        }

        return result;
    }
示例#13
0
    /// <summary>
    /// Repairs tables that were split (for example across pages) by merging the second
    /// part back into the first. Table records are strings of the form
    /// "tableId,row,col|value". Two phases run in order:
    /// 1. For every table id from 2 upwards, a table whose first row contains a null
    ///    cell is appended to its predecessor when both have the same column count and
    ///    lie at most 200 positions apart.
    /// 2. For every node holding a table whose next sibling also holds a table with the
    ///    same column count, the sibling table is appended when its first row contains a
    ///    null cell, or a cell of each table concatenates into a known company full name,
    ///    or (for StockChange documents) its first row contains a known trade method.
    /// </summary>
    /// <param name="root">Root node whose table lists and child nodes are modified in place.</param>
    /// <param name="doc">Document supplying the table count, company names and document type.</param>
    public static void FixSpiltTable(MyRootHtmlNode root, AnnouceDocument doc)
    {
        // Phase 1: merge a table into its predecessor when its first row has a null cell.
        for (int NextTableId = 2; NextTableId <= doc.root.TableList.Count; NextTableId++)
        {
            foreach (var item in doc.root.TableList[NextTableId])
            {
                // Position of the (last) node referring to each of the two tables; -1 if none.
                // NOTE(review): this scan does not depend on `item` but is repeated for every record.
                var FirstTablePos  = -1;
                var SecondTablePos = -1;
                foreach (var p in root.Children)
                {
                    foreach (var s in p.Children)
                    {
                        if (s.TableId == NextTableId - 1)
                        {
                            FirstTablePos = s.PositionId;
                        }
                        if (s.TableId == NextTableId)
                        {
                            SecondTablePos = s.PositionId;
                        }
                    }
                }

                // Tables more than 200 positions apart are not treated as one split table.
                if (SecondTablePos - FirstTablePos > 200)
                {
                    continue;
                }

                var tablerec = item.Split("|");
                var pos      = tablerec[0].Split(",");
                var value    = tablerec[1];
                var row      = int.Parse(pos[1]);
                // The second table has a null cell in its first row.
                if (row == 1 && value == strNullValue)
                {
                    var table     = new HTMLTable(doc.root.TableList[NextTableId - 1]);
                    var nexttable = new HTMLTable(doc.root.TableList[NextTableId]);
                    if (table.ColumnCount != nexttable.ColumnCount)
                    {
                        continue;
                    }
                    // Merge the tables: rows of the second table are shifted by the row count of the first.
                    var offset = table.RowCount;
                    // Re-address the records of the second table and append them to the first.
                    foreach (var Nextitem in root.TableList[NextTableId])
                    {
                        tablerec = Nextitem.Split("|");
                        pos      = tablerec[0].Split(",");
                        value    = tablerec[1];
                        var newtablerec = (NextTableId - 1) + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                        root.TableList[NextTableId - 1].Add(newtablerec);
                    }
                    // The second table is emptied and every node referring to it is detached.
                    root.TableList[NextTableId].Clear();
                    for (int i = 0; i < root.Children.Count; i++)
                    {
                        for (int j = 0; j < root.Children[i].Children.Count; j++)
                        {
                            var node = root.Children[i].Children[j];
                            if (node.TableId == NextTableId)
                            {
                                node.TableId = -1;
                            }
                        }
                    }
                    // The list being enumerated was just cleared, so leave the foreach right away.
                    break;
                }
            }
        }

        // Phase 2.
        // 1. Look for consecutive tables: a table node whose NextBrother is also a table node.
        for (int i = 0; i < root.Children.Count; i++)
        {
            for (int j = 0; j < root.Children[i].Children.Count; j++)
            {
                var node = root.Children[i].Children[j];
                if (node.TableId != -1)
                {
                    if (node.NextBrother != null)
                    {
                        if (node.NextBrother.TableId != -1)
                        {
                            var nextnode  = node.NextBrother;
                            var table     = new HTMLTable(root.TableList[node.TableId]);
                            var nexttable = new HTMLTable(root.TableList[nextnode.TableId]);
                            //Console.WriteLine("First  Table:" + table.RowCount + "X" + table.ColumnCount);
                            //Console.WriteLine("Second Table:" + nexttable.RowCount + "X" + nexttable.ColumnCount);
                            // Only tables with the same number of columns are merge candidates.
                            if (table.ColumnCount != nexttable.ColumnCount)
                            {
                                continue;
                            }
                            //Console.WriteLine("Two Tables Has Same Column Count!");
                            // 2. The second of two consecutive tables often has a row containing <NULL>:
                            //    check the cells of its first row.
                            bool hasnull = false;
                            for (int nullcell = 1; nullcell <= table.ColumnCount; nullcell++)
                            {
                                if (nexttable.CellValue(1, nullcell) == HTMLTable.strNullValue)
                                {
                                    hasnull = true;
                                    break;
                                }
                            }

                            var ComboCompanyName         = "";
                            var ComboCompanyNameColumnNo = -1;
                            var CompanyFullNameList      = doc.companynamelist.Select((x) => { return(x.secFullName); }).Distinct().ToList();
                            // Can a cell of table A and a cell of table B in the same column be
                            // concatenated into a known company full name? Spaces must be removed first!
                            // NOTE(review): both row loops use "< MaxRow" while cells are addressed from 1,
                            // so the last row of each table appears never to be examined — confirm
                            // whether RowCount already accounts for that.
                            int MaxColumn = table.ColumnCount;
                            for (int col = 1; col <= MaxColumn; col++)
                            {
                                int TableAMaxRow = table.RowCount;
                                int TableBMaxRow = nexttable.RowCount;
                                for (int RowCntA = 1; RowCntA < TableAMaxRow; RowCntA++)
                                {
                                    for (int RowCntB = 1; RowCntB < TableBMaxRow; RowCntB++)
                                    {
                                        var valueA = table.CellValue(RowCntA, col).Replace(" ", "");
                                        var valueB = nexttable.CellValue(RowCntB, col).Replace(" ", "");
                                        if (valueA != "" && valueB != "")
                                        {
                                            var value = valueA + valueB;
                                            if (CompanyFullNameList.Contains(value))
                                            {
                                                ComboCompanyName         = value;
                                                ComboCompanyNameColumnNo = col;
                                                //Console.WriteLine("Found FullName:" + value);
                                                break;
                                            }
                                        }
                                    }
                                    // A match was found in the inner loop: stop scanning rows of table A.
                                    if (ComboCompanyNameColumnNo != -1)
                                    {
                                        break;
                                    }
                                }
                                // A match was found: stop scanning further columns.
                                if (ComboCompanyNameColumnNo != -1)
                                {
                                    break;
                                }
                            }
                            if (ComboCompanyNameColumnNo != -1)
                            {
                                // Complete the split name. Note: not every cell may be patched!
                                // Table A holds the beginning of the company name, table B its end.
                                // NOTE(review): an empty cell value also satisfies StartsWith/EndsWith,
                                // so empty cells would be overwritten too — confirm this is intended.
                                for (int k = 0; k < root.TableList[node.TableId].Count; k++)
                                {
                                    var tablerec = root.TableList[node.TableId][k].Split("|");
                                    var value    = tablerec[1].Replace(" ", "");
                                    // Table A: the company name starts with the cell value.
                                    if (ComboCompanyName.StartsWith(value))
                                    {
                                        root.TableList[node.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                                    }
                                }
                                for (int k = 0; k < root.TableList[nextnode.TableId].Count; k++)
                                {
                                    var tablerec = root.TableList[nextnode.TableId][k].Split("|");
                                    var value    = tablerec[1].Replace(" ", "");
                                    // Table B: the company name ends with the cell value.
                                    if (ComboCompanyName.EndsWith(value))
                                    {
                                        root.TableList[nextnode.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                                    }
                                }
                            }


                            // Special business handling: share increase / decrease announcements.
                            bool specaillogic = false;
                            var  BuyMethod = new string[] { "集中竞价交易", "竞价交易", "大宗交易", "约定式购回" }.ToList();
                            if (doc.GetType() == typeof(StockChange))
                            {
                                // Such tables may continue without a header row: a trade method in
                                // the first row of the second table marks it as a continuation.
                                for (int spCell = 1; spCell <= table.ColumnCount; spCell++)
                                {
                                    if (BuyMethod.Contains(nexttable.CellValue(1, spCell)))
                                    {
                                        specaillogic = true;
                                        break;
                                    }
                                }
                            }

                            // Any of the three indicators triggers the merge of the second table into the first.
                            if (hasnull || ComboCompanyNameColumnNo != -1 || specaillogic)
                            {
                                var offset = table.RowCount;
                                // Re-address the records of the second table and append them to the first.
                                foreach (var item in root.TableList[nextnode.TableId])
                                {
                                    var tablerec    = item.Split("|");
                                    var pos         = tablerec[0].Split(",");
                                    var value       = tablerec[1];
                                    var newtablerec = node.TableId + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                                    root.TableList[node.TableId].Add(newtablerec);
                                }
                                // The second table is emptied and its node no longer refers to a table.
                                root.TableList[nextnode.TableId].Clear();
                                nextnode.TableId = -1;
                                //Console.WriteLine("Found Split Tables!!");
                            }
                        }
                    }
                }
            }
        }
    }
示例#14
0
    /// <summary>
    /// Repairs tables that were split into two consecutive tables (e.g. by a page break).
    /// </summary>
    /// <remarks>
    /// After running <c>FirstRowNullFix</c> and <c>OneRowFix</c>, every second-level node
    /// that is a table and is directly followed by another table with the same column
    /// count is examined. The second table is merged through <c>MergeTable</c> when
    /// either its first row contains a null marker, or a cell of the first table
    /// concatenated with a same-column cell of the second table (spaces removed) is a
    /// known company full name. In the latter case the partial cells are completed with
    /// the full name before merging.
    /// </remarks>
    /// <param name="doc">Document whose <c>root.TableList</c> is modified in place.</param>
    public static void FixSpiltTable(AnnouceDocument doc)
    {
        //Merge for tables whose first row holds NULL
        FirstRowNullFix(doc);

        OneRowFix(doc);

        for (int i = 0; i < doc.root.Children.Count; i++)
        {
            for (int j = 0; j < doc.root.Children[i].Children.Count; j++)
            {
                var node = doc.root.Children[i].Children[j];
                //1. Only a table node directly followed by another table node is a candidate
                if (node.TableId == -1 || node.NextBrother == null || node.NextBrother.TableId == -1)
                {
                    continue;
                }
                var nextnode  = node.NextBrother;
                var table     = new HTMLTable(doc.root.TableList[node.TableId]);
                var nexttable = new HTMLTable(doc.root.TableList[nextnode.TableId]);
                if (table.ColumnCount != nexttable.ColumnCount)
                {
                    continue;
                }

                //2. The second of two consecutive tables usually has a row containing <NULL>
                bool hasnull = false;
                for (int nullcell = 1; nullcell <= table.ColumnCount; nullcell++)
                {
                    if (nexttable.CellValue(1, nullcell) == HTMLTable.strNullValue)
                    {
                        hasnull = true;
                        break;
                    }
                }

                var ComboCompanyName         = "";
                var ComboCompanyNameColumnNo = -1;
                //Set lookup instead of List.Contains inside the triple loop below
                var CompanyFullNames = new HashSet<String>(doc.companynamelist.Select((x) => { return(x.secFullName); }));
                //Can two cells in the same column of both tables be joined into a company name?
                //Spaces must be removed before comparing.
                int MaxColumn = table.ColumnCount;
                for (int col = 1; col <= MaxColumn && ComboCompanyNameColumnNo == -1; col++)
                {
                    int TableAMaxRow = table.RowCount;
                    int TableBMaxRow = nexttable.RowCount;
                    //NOTE(review): both row loops stop before RowCount while the column loops
                    //are inclusive, so the last row of each table is never examined.
                    //Looks like an off-by-one; confirm against HTMLTable before changing.
                    for (int RowCntA = 1; RowCntA < TableAMaxRow && ComboCompanyNameColumnNo == -1; RowCntA++)
                    {
                        for (int RowCntB = 1; RowCntB < TableBMaxRow; RowCntB++)
                        {
                            var valueA = table.CellValue(RowCntA, col).Replace(" ", "");
                            var valueB = nexttable.CellValue(RowCntB, col).Replace(" ", "");
                            if (valueA == "" || valueB == "")
                            {
                                continue;
                            }
                            var value = valueA + valueB;
                            if (CompanyFullNames.Contains(value))
                            {
                                ComboCompanyName         = value;
                                ComboCompanyNameColumnNo = col;
                                break;
                            }
                        }
                    }
                }

                if (ComboCompanyNameColumnNo != -1)
                {
                    //Completion: not every cell may be filled. Table A holds the leading part
                    //of the name, table B holds the trailing part.
                    //Empty cells are skipped explicitly: StartsWith("")/EndsWith("") are always
                    //true and would otherwise overwrite every blank cell with the company name.
                    var firstCells = doc.root.TableList[node.TableId];
                    for (int k = 0; k < firstCells.Count; k++)
                    {
                        var tablerec = firstCells[k].Split("|");
                        var value    = tablerec[1].Replace(" ", "");
                        //Table A: cell is the beginning of the company name
                        if (value != "" && ComboCompanyName.StartsWith(value))
                        {
                            firstCells[k] = tablerec[0] + "|" + ComboCompanyName;
                        }
                    }
                    var secondCells = doc.root.TableList[nextnode.TableId];
                    for (int k = 0; k < secondCells.Count; k++)
                    {
                        var tablerec = secondCells[k].Split("|");
                        var value    = tablerec[1].Replace(" ", "");
                        //Table B: cell is the end of the company name
                        if (value != "" && ComboCompanyName.EndsWith(value))
                        {
                            secondCells[k] = tablerec[0] + "|" + ComboCompanyName;
                        }
                    }
                }

                if (hasnull || ComboCompanyNameColumnNo != -1)
                {
                    MergeTable(doc, nextnode.TableId);
                }
            }
        }
    }
示例#15
0
    /// <summary>
    /// Builds candidate project names from the document's NER result file.
    /// </summary>
    /// <remarks>
    /// The file is read line by line: a line starting with "&lt;sent" opens a new sentence
    /// and a line starting with "&lt;word" is parsed into a <c>struWordNER</c>.
    /// A candidate starts at a place-name token (ne "B-Ns"/"S-Ns") or the word "新建",
    /// accumulates the following tokens, and is closed by one of the keywords
    /// "项目", "工程", "标段", "采购" (with a short look-ahead for a following keyword).
    /// Candidates containing "重大工程" or ";" or longer than 50 characters are rejected.
    /// </remarks>
    /// <param name="doc">Document providing <c>NerXMLFileName</c> and <c>Id</c>.</param>
    /// <returns>Distinct candidate names; an empty list when the NER file does not exist.</returns>
    public static List <String> GetProjectNameByNer(AnnouceDocument doc)
    {
        var ProjList = new List <String>();

        if (!File.Exists(doc.NerXMLFileName))
        {
            return(ProjList);
        }

        //The result file consists of several XML documents, so it is scanned line by
        //line instead of being parsed as a single XML document.
        var pl = new List <List <struWordNER> >();
        List <struWordNER> wl = null;
        //using: the reader is released even when parsing a line throws
        using (var sr = new StreamReader(doc.NerXMLFileName))
        {
            while (!sr.EndOfStream)
            {
                var line = sr.ReadLine().Trim();
                if (line.StartsWith("<sent"))
                {
                    if (wl != null)
                    {
                        pl.Add(wl);
                    }
                    //A new sentence
                    wl = new List <struWordNER>();
                }
                if (line.StartsWith("<word"))
                {
                    if (wl == null)
                    {
                        //Word before any sentence tag: open an implicit sentence instead of
                        //failing with a null reference
                        wl = new List <struWordNER>();
                    }
                    wl.Add(new struWordNER(line));
                }
            }
        }
        if (wl != null)
        {
            pl.Add(wl);
        }

        //NOTE(review): proj is not reset between sentences, so an unfinished candidate
        //carries over into the next sentence. Kept as in the original.
        var proj = String.Empty;

        foreach (var p in pl)
        {
            for (int ScanIdx = 0; ScanIdx < p.Count; ScanIdx++)
            {
                var word = p[ScanIdx];
                if (word.ne == "B-Ns" || word.ne == "S-Ns" ||
                    word.cont == "新建")
                {
                    //Start of a place name (or a single-token place name): begin a candidate,
                    //or extend it when the candidate already starts with "新建"
                    if (!string.IsNullOrEmpty(proj) && proj.StartsWith("新建"))
                    {
                        proj += word.cont;
                    }
                    else
                    {
                        proj = word.cont;
                    }
                    continue;
                }

                var isClosingKeyword = word.cont.Equals("项目") ||
                                       word.cont.Equals("工程") ||
                                       word.cont.Equals("标段") ||
                                       word.cont.Equals("采购");
                if (!isClosingKeyword)
                {
                    //Ordinary token: only collected while a candidate is open
                    if (!String.IsNullOrEmpty(proj))
                    {
                        proj += word.cont;
                    }
                    continue;
                }
                if (String.IsNullOrEmpty(proj))
                {
                    continue;
                }

                proj += word.cont;
                var FurtherTo = Math.Min(p.Count, ScanIdx + 5);
                var ShardProj = proj;

                //Look-ahead after "标段"
                if (word.cont == "标段")
                {
                    //Is one of 项目 / 工程 / 承包 among the next few tokens?
                    for (int TrailingScanIdx = ScanIdx + 1;
                         TrailingScanIdx < FurtherTo;
                         TrailingScanIdx++)
                    {
                        ShardProj += p[TrailingScanIdx].cont;
                        if (p[TrailingScanIdx].cont == "项目" ||
                            p[TrailingScanIdx].cont == "工程" ||
                            p[TrailingScanIdx].cont == "承包")
                        {
                            proj = ShardProj;
                            break;
                        }
                    }
                }

                //Look-ahead after "工程" / "项目"
                if (word.cont == "工程" || word.cont == "项目")
                {
                    //Is "标段" among the next few tokens? Track whether it sits inside brackets.
                    var isContranBrack = false;
                    for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo;
                         TrailingScanIdx++)
                    {
                        ShardProj += p[TrailingScanIdx].cont;
                        if (p[TrailingScanIdx].cont.Trim() == "(")
                        {
                            isContranBrack = true;
                        }
                        if (p[TrailingScanIdx].cont.Trim() == ")")
                        {
                            isContranBrack = false;
                        }
                        if (p[TrailingScanIdx].cont == "标段")
                        {
                            ScanIdx = TrailingScanIdx;
                            if (isContranBrack)
                            {
                                //Close the still-open bracket and step over its token
                                ShardProj += ")";
                                ScanIdx++;
                            }
                            proj = ShardProj;
                            break;
                        }
                    }
                }

                //Final check: is the very next token another 工程 / 项目 / 标段 / 活动?
                if (ScanIdx + 1 <= p.Count - 1)
                {
                    if (p[ScanIdx + 1].cont == "工程" || p[ScanIdx + 1].cont == "项目" ||
                        p[ScanIdx + 1].cont == "标段" || p[ScanIdx + 1].cont == "活动")
                    {
                        proj += p[ScanIdx + 1].cont;
                        ScanIdx++;
                    }
                }

                //The original also tested proj.Contains(""), which is true for every string
                //and therefore rejected every candidate. That check is dropped here.
                //NOTE(review): the literal probably held a character lost in transit;
                //restore it as an extra rejection rule if it can be recovered.
                var isOK = !proj.Contains("重大工程") &&
                           proj.Length <= 50 &&
                           !proj.Contains(";");
                if (isOK)
                {
                    Console.WriteLine(doc.Id + " NER 发现工程:" + proj);
                    ProjList.Add(proj);
                }
                proj = string.Empty;
            }
        }
        return(ProjList.Distinct().ToList());
    }
示例#16
0
    /// <summary>
    /// Gathers every entity located in one sentence/paragraph position into a
    /// single <c>ParagraghLoc</c>.
    /// </summary>
    /// <param name="doc">Document supplying the date, money, percent, stock-number,
    /// customer and quotation lists.</param>
    /// <param name="PosId">Location id; only items whose <c>Loc</c> equals it are kept.</param>
    /// <param name="nerList">Additional located entities to filter by the same location.</param>
    /// <returns>The collected items; <c>NerList</c> is sorted by <c>StartIdx</c>.</returns>
    ParagraghLoc LocateParagraphInfo(AnnouceDocument doc, int PosId, List <LocAndValue <String> > nerList)
    {
        var result = new ParagraghLoc();

        result.Init();

        foreach (var date in doc.datelist)
        {
            if (date.Loc != PosId)
            {
                continue;
            }
            result.datelist.Add(date);
        }

        foreach (var money in doc.moneylist)
        {
            if (money.Loc != PosId)
            {
                continue;
            }
            result.moneylist.Add(money);
        }

        foreach (var percent in doc.percentList)
        {
            if (percent.Loc != PosId)
            {
                continue;
            }
            result.percentList.Add(percent);
        }

        foreach (var stockNumber in doc.StockNumberList)
        {
            if (stockNumber.Loc != PosId)
            {
                continue;
            }
            result.socketNumberList.Add(stockNumber);
        }

        foreach (var customer in doc.CustomerList)
        {
            if (customer.Loc != PosId)
            {
                continue;
            }
            //Kept in CustomerList for convenient access and in NerList so the
            //search routines can find it
            result.CustomerList.Add(customer);
            result.NerList.Add(customer);
        }

        foreach (var quotation in doc.quotationList)
        {
            if (quotation.Loc != PosId)
            {
                continue;
            }
            result.NerList.Add(quotation);
        }

        foreach (var entity in nerList)
        {
            if (entity.Loc != PosId)
            {
                continue;
            }
            result.NerList.Add(entity);
        }

        //Order the merged entities by their start index
        result.NerList.Sort((left, right) => left.StartIdx.CompareTo(right.StartIdx));
        return result;
    }
示例#17
0
    /// <summary>
    /// Searches every paragraph's entity list for targets near a base word.
    /// </summary>
    /// <remarks>
    /// For each entity matching <c>rule.BaseWord</c>, the list is scanned towards higher
    /// indices when <c>rule.SearchForward</c> is true and towards lower indices otherwise.
    /// The first entity that matches <c>rule.Target</c> and passes <c>rule.Validator</c>
    /// (when one is set) is collected, and the scan for that base word stops.
    /// </remarks>
    /// <param name="doc">Document whose <c>nermap.ParagraghlocateDict</c> is searched.
    /// When the base word has no description, <c>CustomerList</c> is rebuilt from the
    /// base word's keywords and the entity map is re-analysed first.</param>
    /// <param name="rule">Base word, target, direction and optional validator.</param>
    /// <returns>All collected target entities, in discovery order.</returns>
    public static List <LocAndValue <String> > Search(AnnouceDocument doc, SearchRule rule)
    {
        var hits = new List <LocAndValue <String> >();

        if (rule.BaseWord.Description.Count == 0)
        {
            //Plain keyword without any description
            doc.CustomerList = LocateCustomerWord(doc.root, rule.BaseWord.Word, "关键字");
            doc.nermap.Anlayze(doc);
        }

        foreach (var paragragh in doc.nermap.ParagraghlocateDict.Values)
        {
            var entities = paragragh.NerList;
            for (int baseIdx = 0; baseIdx < entities.Count; baseIdx++)
            {
                if (!IsMatch(rule.BaseWord, entities[baseIdx]))
                {
                    continue;
                }

                //+1 walks to the entities after the base word, -1 to those before it
                int step = rule.SearchForward ? 1 : -1;
                for (int probe = baseIdx + step; probe > -1 && probe < entities.Count; probe += step)
                {
                    var candidate = entities[probe];
                    if (!IsMatch(rule.Target, candidate))
                    {
                        continue;
                    }
                    if (rule.Validator == null || rule.Validator(candidate))
                    {
                        hits.Add(candidate);
                        break;
                    }
                }
            }
        }
        return hits;
    }
示例#18
0
    /// <summary>
    /// Counts the words surrounding the first occurrence of <paramref name="KeyWord"/>
    /// in every sentence of the document.
    /// </summary>
    /// <remarks>
    /// Up to five words before and five words after the keyword are counted into
    /// <c>LeadingWordDict</c> / <c>TrailingWordDict</c>; leading verbs are additionally
    /// counted into <c>LeadingVerbWordDict</c>. Punctuation is skipped, except that a
    /// leading ":" is counted.
    /// </remarks>
    /// <param name="doc">Document whose sentences (<c>root.Children[*].Children</c>) are segmented.</param>
    /// <param name="KeyWord">Keyword; temporarily added to the segmenter so it is cut as one word.</param>
    public void AnlayzeEntitySurroundWords(AnnouceDocument doc, string KeyWord)
    {
        //Number of words inspected on each side of the keyword
        const int WindowSize = 5;

        JiebaSegmenter segmenter = new JiebaSegmenter();

        segmenter.AddWord(KeyWord);
        try
        {
            PosSegmenter posSeg = new PosSegmenter(segmenter);

            foreach (var paragrah in doc.root.Children)
            {
                foreach (var sentence in paragrah.Children)
                {
                    //Default (accurate) mode; used to find the keyword position
                    var segments = posSeg.Cut(sentence.Content).ToList();
                    for (int i = 0; i < segments.Count; i++)
                    {
                        if (!segments[i].Word.Equals(KeyWord))
                        {
                            continue;
                        }

                        //Five words before and five words after.
                        //EndInx is exclusive, hence +1: the original used i + 5 and
                        //therefore only looked at four trailing words.
                        var startInx = Math.Max(0, i - WindowSize);
                        var EndInx   = Math.Min(i + WindowSize + 1, segments.Count);
                        for (int s = startInx; s < i; s++)
                        {
                            if (segments[s].Flag == LTPTrainingNER.词性标点 && segments[s].Word != ":")
                            {
                                continue;
                            }
                            if (LeadingWordDict.ContainsKey(segments[s].Word))
                            {
                                LeadingWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                LeadingWordDict.Add(segments[s].Word, 1);
                            }

                            //Verbs get their own statistics
                            if (segments[s].Flag == LTPTrainingNER.动词)
                            {
                                if (LeadingVerbWordDict.ContainsKey(segments[s].Word))
                                {
                                    LeadingVerbWordDict[segments[s].Word]++;
                                }
                                else
                                {
                                    LeadingVerbWordDict.Add(segments[s].Word, 1);
                                }
                            }
                        }
                        for (int s = i + 1; s < EndInx; s++)
                        {
                            if (segments[s].Flag == LTPTrainingNER.词性标点)
                            {
                                continue;
                            }
                            if (TrailingWordDict.ContainsKey(segments[s].Word))
                            {
                                TrailingWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                TrailingWordDict.Add(segments[s].Word, 1);
                            }
                        }
                        break;     //Only the first occurrence in a sentence is counted
                    }
                }
            }
        }
        finally
        {
            //Always undo AddWord, even when segmentation throws
            segmenter.DeleteWord(KeyWord);
        }
    }
示例#19
0
    /// <summary>
    /// Fills null cells of every table, using company names and row-span markers.
    /// </summary>
    /// <remarks>
    /// Cells are strings of the form "tableId,row,col|value".
    /// Pass 1: a null cell receives the value of the nearest cell above it in the same
    /// column of the same table whose value (spaces removed) is a known company full or
    /// short name.
    /// Pass 2: a remaining null cell outside row 1 becomes a row-span cell when the cell
    /// directly above or below it in the same column is a row-span cell.
    /// </remarks>
    /// <param name="doc">Document whose <c>root.TableList</c> is modified in place.</param>
    public static void FixNullValue(AnnouceDocument doc)
    {
        //All known company names (full and short) for constant-time lookup
        var CompanyNames = new HashSet<String>(doc.companynamelist.Select((x) => { return(x.secFullName); }));
        CompanyNames.UnionWith(doc.companynamelist.Select((x) => { return(x.secShortName); }));

        for (int tableId = 1; tableId <= doc.root.TableList.Count; tableId++)
        {
            var tableCells = doc.root.TableList[tableId];

            //Company-name cells of THIS table only. The original shared one list across all
            //tables, so names from earlier tables leaked into later ones (only row and
            //column are compared below) and repeated Reverse() calls scrambled the order.
            var CompanyPos = new List<String>();
            for (int checkItemIdx = 0; checkItemIdx < tableCells.Count; checkItemIdx++)
            {
                var tablerec = tableCells[checkItemIdx].Split("|");
                var value    = tablerec[1].Replace(" ", "");
                if (CompanyNames.Contains(value))
                {
                    CompanyPos.Add(tableCells[checkItemIdx]);
                }
            }
            //Last cell first, so the first match found below is the nearest one above
            CompanyPos.Reverse();

            for (int fixIdx = 0; fixIdx < tableCells.Count; fixIdx++)
            {
                var nullrec = tableCells[fixIdx].Split("|");
                if (!nullrec[1].Equals(strNullValue))
                {
                    continue;
                }
                var nullpos = nullrec[0].Split(",");
                var nullrow = int.Parse(nullpos[1]);
                var nullcol = int.Parse(nullpos[2]);
                foreach (var item in CompanyPos)
                {
                    //Search upwards for the nearest company name in the same column
                    var tablerec = item.Split("|");
                    var pos      = tablerec[0].Split(",");
                    var value    = tablerec[1].Replace(" ", "");
                    var row      = int.Parse(pos[1]);
                    var col      = int.Parse(pos[2]);
                    if (nullcol == col && nullrow > row)
                    {
                        tableCells[fixIdx] = nullrec[0] + "|" + value;
                        break;
                    }
                }
            }
        }

        for (int tableId = 1; tableId <= doc.root.TableList.Count; tableId++)
        {
            var table = doc.root.TableList[tableId];
            for (int checkItemIdx = 0; checkItemIdx < table.Count; checkItemIdx++)
            {
                var tablerec = table[checkItemIdx].Split("|");
                var pos      = tablerec[0].Split(",");
                var value    = tablerec[1].Replace(" ", "");
                var row      = int.Parse(pos[1]);
                var col      = int.Parse(pos[2]);
                if (value == strNullValue && row != 1)
                {
                    //If the row above or the row below is a RowSpan cell, this one is too.
                    //The table is checked live, so a cell converted here can trigger the
                    //conversion of the cell in the next row.
                    var pre = tableId.ToString() + "," + (row - 1).ToString() + "," + col.ToString() + "|" + strRowSpanValue;
                    if (table.Contains(pre))
                    {
                        table[checkItemIdx] = tablerec[0] + "|" + strRowSpanValue;
                    }
                    else
                    {
                        var next = tableId.ToString() + "," + (row + 1).ToString() + "," + col.ToString() + "|" + strRowSpanValue;
                        if (table.Contains(next))
                        {
                            table[checkItemIdx] = tablerec[0] + "|" + strRowSpanValue;
                        }
                    }
                }
            }
        }
    }
示例#20
0
    /// <summary>
    /// Runs every configured extraction strategy on the document and stores the
    /// candidates in the corresponding candidate collections of this instance.
    /// </summary>
    /// <remarks>
    /// Strategies, each active only when its configuration member is set:
    /// keyword map (exclusive: returns right after it), leading-colon keywords
    /// (text file, then HTML tree), quotation trailing words, dependency-parse
    /// keywords, and start/end string features (text file, then HTML tree).
    /// </remarks>
    /// <param name="doc">Document supplying <c>root</c>, <c>TextFileName</c>,
    /// <c>quotationList</c> and <c>Dplist</c>.</param>
    public void Extract(AnnouceDocument doc)
    {
        //Pure keyword type
        if (KeyWordMap.Count != 0)
        {
            var candidate = ExtractByKeyWordMap(doc.root);
            if (candidate.Count == 1)
            {
                WordMapResult = candidate.First();
            }
            if (candidate.Count > 1)
            {
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine("找到纯关键字类型两个关键字");
                }
            }
            return;
        }

        if (LeadingColonKeyWordList != null)
        {
            //Rule with a fixed leading word, e.g. [项目名:].
            //These values are not constrained by other factors such as min/max length;
            //they have a dedicated preprocessor.
            var ExtractorText = new ExtractPropertyByText();
            //The TEXT version may contain spaces, so the HTML version is checked as well
            ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
            ExtractorText.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorText.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }

            var Extractor = new ExtractPropertyByHTML();
            Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
            Extractor.Extract(doc.root);
            //Iterate the HTML extractor's own results. The original looped over
            //ExtractorText.CandidateWord again, so HTML candidates were never used.
            foreach (var item in Extractor.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (LeadingColonKeyWordCandidatePreprocess != null)
                {
                    PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
                }
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                //Values already found in the TEXT version are not added twice
                if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
                {
                    LeadingColonKeyWordCandidate.Add(PropertyValue);
                }
            }
        }

        //Book-title marks and quotation marks
        if (QuotationTrailingWordList != null)
        {
            //Quoted text ending with one of the configured trailing words
            foreach (var bracket in doc.quotationList)
            {
                foreach (var word in QuotationTrailingWordList)
                {
                    if (bracket.Value.EndsWith(word))
                    {
                        var PropertyValue = CheckCandidate(bracket.Value);
                        if (String.IsNullOrEmpty(PropertyValue))
                        {
                            continue;
                        }
                        if (!Program.IsMultiThreadMode)
                        {
                            Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                        }
                        QuotationTrailingCandidate.Add(PropertyValue);
                    }
                }
            }
        }

        //Dependency parsing
        if (DpKeyWordList != null)
        {
            var ExtractDP = new ExtractPropertyByDP();
            ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
            foreach (var item in ExtractDP.CandidateWord)
            {
                var PropertyValue = CheckCandidate(item.Value);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                DpKeyWordCandidate.Add(PropertyValue);
            }
        }

        if (ExternalStartEndStringFeature != null)
        {
            var ExtractorTEXT = new ExtractPropertyByText();
            ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
            foreach (var item in ExtractorTEXT.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }

            //Covers cases that cannot be extracted from the TEXT version
            var ExtractorHTML = new ExtractPropertyByHTML();
            ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
            ExtractorHTML.Extract(doc.root);
            foreach (var item in ExtractorHTML.CandidateWord)
            {
                var PropertyValue = item.Value;
                if (ExternalStartEndStringFeatureCandidatePreprocess != null)
                {
                    PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
                }
                PropertyValue = CheckCandidate(PropertyValue);
                if (String.IsNullOrEmpty(PropertyValue))
                {
                    continue;
                }
                if (!Program.IsMultiThreadMode)
                {
                    Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                }
                if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
                {
                    ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
                }
            }
        }
    }
示例#21
0
    /// <summary>
    /// Finds candidate project names by word segmentation.
    /// </summary>
    /// <remarks>
    /// In every non-empty sentence, each occurrence of "项目", "工程", "标段" or "采购" is
    /// used as the end of a candidate. The words before it are scanned backwards (not
    /// past the end of the previously accepted name) for a place name; the text from the
    /// first word of a run of place names up to the keyword becomes the candidate. The
    /// scan stops at any punctuation other than round brackets, and a candidate is only
    /// accepted while no ")" is waiting for its "(".
    /// </remarks>
    /// <param name="doc">Document whose sentences (<c>root.Children[*].Children</c>) are scanned.</param>
    /// <returns>The candidates in discovery order (duplicates are kept).</returns>
    public static List <String> GetProjectNameByCutWord(AnnouceDocument doc)
    {
        var tagger      = new PosSegmenter();
        var projectList = new List <String>();
        var endKeywords = new[] { "项目", "工程", "标段", "采购" };

        foreach (var paragrah in doc.root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var tokens = tagger.Cut(sentence.Content).ToList();
                //A token is a place name by its POS flag or by dictionary lookup
                Func <int, bool> isPlace = idx =>
                    tokens[idx].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(tokens[idx].Word);

                var lastAcceptedEnd = -1;
                for (int keywordIdx = 0; keywordIdx < tokens.Count; keywordIdx++)
                {
                    if (!endKeywords.Contains(tokens[keywordIdx].Word))
                    {
                        continue;
                    }

                    var bracketsBalanced = true;
                    //Walk backwards looking for a place name in front of the keyword
                    for (int scan = keywordIdx; scan > lastAcceptedEnd; scan--)
                    {
                        if (isPlace(scan))
                        {
                            //Place names may be adjacent (e.g. 上海市嘉定): move on to the first one
                            if (scan != 0 && isPlace(scan - 1))
                            {
                                continue;
                            }
                            if (bracketsBalanced)
                            {
                                var candidate = string.Concat(
                                    tokens.Skip(scan).Take(keywordIdx - scan + 1).Select(t => t.Word));
                                lastAcceptedEnd = keywordIdx;
                                Console.WriteLine(doc.Id + "发现工程:" + candidate);
                                projectList.Add(candidate);
                                break;  //No need to look for further place names
                            }
                        }
                        if (tokens[scan].Flag == LTPTrainingNER.词性标点)
                        {
                            if (tokens[scan].Word == ")")
                            {
                                bracketsBalanced = false;   //A bracket pair is now open (scanning backwards)
                            }
                            else if (tokens[scan].Word == "(")
                            {
                                bracketsBalanced = true;    //The pair is closed again
                            }
                            else
                            {
                                break;                      //Any other punctuation ends the scan
                            }
                        }
                    }
                }
            }
        }
        return projectList;
    }