コード例 #1
0
    /// <summary>
    /// Segments the full text of every paragraph under <paramref name="root"/> and, for each
    /// token equal to <paramref name="KeyWord"/>, prints up to five tokens before it and up to
    /// five tokens after it to the console.
    /// </summary>
    /// <param name="root">Parsed document; its <c>Children</c> are scanned as paragraphs.</param>
    /// <param name="KeyWord">Token to search for (compared with <c>Equals</c> against each segment).</param>
    public static void Anlayze(HTMLEngine.MyRootHtmlNode root, string KeyWord)
    {
        // Number of neighbouring tokens reported on each side of a hit.
        const int ContextSize = 5;

        Console.WriteLine("关键字:[" + KeyWord + "]");

        foreach (var paragrah in root.Children)
        {
            // The original author notes that Cut defaults to "accurate" mode.
            var segments = segmenter.Cut(paragrah.FullText).ToList();
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

            // Look for every position where the keyword occurs.
            for (int i = 0; i < segments.Count; i++)
            {
                if (!segments[i].Equals(KeyWord))
                {
                    continue;
                }

                var startInx = Math.Max(0, i - ContextSize);
                // Exclusive upper bound. It must be i + 1 + ContextSize: using i + ContextSize
                // would print only four following tokens instead of five.
                var EndInx = Math.Min(i + 1 + ContextSize, segments.Count);

                for (int s = startInx; s < i; s++)
                {
                    Console.WriteLine("前导关键字:[" + segments[s] + "]");
                }
                Console.WriteLine("关键字:[" + KeyWord + "]");
                for (int s = i + 1; s < EndInx; s++)
                {
                    Console.WriteLine("后续关键字:[" + segments[s] + "]");
                }
            }
        }
    }
コード例 #2
0
ファイル: LocateProperty.cs プロジェクト: loooo139/gradute
    /// <summary>
    /// Finds every match of <c>RegularTool.PercentExpress</c> in each sentence of the document
    /// and records where it was found.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>
    /// One entry per match, carrying the sentence's <c>PositionId</c>, the matched text and the
    /// match index within the sentence content. Empty when nothing matches.
    /// </returns>
    public static List <LocAndValue <String> > LocatePercent(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue <String> >();
        // The pattern does not depend on the sentence, so build the matcher once.
        var percentRegex = new Regex(RegularTool.PercentExpress);

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                foreach (Match item in percentRegex.Matches(sentence.Content))
                {
                    list.Add(new LocAndValue <String>()
                    {
                        Loc         = sentence.PositionId,
                        Description = "百分比",
                        Value       = item.Value,
                        StartIdx    = item.Index
                    });
                }
            }
        }
        return(list);
    }
コード例 #3
0
 // For every table cell whose normalized text equals the normalized keyword, count the header
 // (row 1) of that cell's column in TrainingTitleResult.
 public static void PutTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
 {
     foreach (var tableEntry in root.TableList)
     {
         var table = new HTMLTable(tableEntry.Value);
         // Scanning starts at row 2 (original comment: "start from the second row").
         // NOTE(review): loops use "< RowCount" / "< ColumnCount" with indices starting at 2 / 1;
         // if HTMLTable is 1-based this skips the last row and column - confirm.
         for (int row = 2; row < table.RowCount; row++)
         {
             for (int col = 1; col < table.ColumnCount; col++)
             {
                 bool isKeywordCell = table.CellValue(row, col).NormalizeKey().Equals(KeyWord.NormalizeKey());
                 if (!isKeywordCell)
                 {
                     continue;
                 }

                 var header = table.CellValue(1, col);
                 if (TrainingTitleResult.ContainsKey(header))
                 {
                     TrainingTitleResult[header]++;
                 }
                 else
                 {
                     TrainingTitleResult.Add(header, 1);
                 }
             }
         }
     }
 }
コード例 #4
0
    /// <summary>
    /// Stock increase/decrease training. Walks <c>TraningDataset.StockChangeList</c>, parses the
    /// announcement HTML whenever the record id differs from the previous record's id, feeds the
    /// parsed document to <c>PutValueTrainingItem</c> for the column titles "减持方式" and
    /// "增持方式", and writes the result of <c>Utility.FindTop(10, ...)</c> to
    /// <c>Program.Training</c>.
    /// </summary>
    /// <param name="TraningCnt">
    /// Maximum number of announcements to parse and train on. Training stops when a further
    /// announcement would exceed this number.
    /// </param>
    public static void Traning(int TraningCnt = int.MaxValue)
    {
        var ChangeMethodTool = new TableAnlayzeTool();
        var PreviewId        = String.Empty;
        var PreviewRoot      = new HTMLEngine.MyRootHtmlNode();
        int Cnt = 0;
        // Built once: the title list is identical for every record.
        var titleKeyWords = new string[] { "减持方式", "增持方式" }.ToList();

        foreach (var stockchange in TraningDataset.StockChangeList)
        {
            if (!PreviewId.Equals(stockchange.Id))
            {
                // Check the limit BEFORE parsing the next announcement. Checking after the parse
                // would discard the announcement just loaded, training on one fewer than asked.
                if (Cnt == TraningCnt)
                {
                    break;
                }
                var htmlfile = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\增减持\html\" + stockchange.Id + ".html";
                PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
                PreviewId   = stockchange.Id;
                Cnt++;
            }
            ChangeMethodTool.PutValueTrainingItem(PreviewRoot, titleKeyWords);
        }

        var rank = Utility.FindTop(10, ChangeMethodTool.TrainingValueResult);

        Program.Training.WriteLine("增减持方式");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }
    }
コード例 #5
0
ファイル: IncreaseStockTraning.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Trains the table-title statistics for the "publish target" field. Initializes the
    /// increase-stock dataset, parses each announcement's HTML when the record id changes, feeds
    /// every record's <c>PublishTarget</c> to <c>TableAnlayzeTool.PutTrainingItem</c>, then prints
    /// every title whose count reaches the tenth-highest count (or the lowest count when fewer
    /// than ten titles were collected).
    /// </summary>
    public static void TrainingIncreaseTarget()
    {
        TraningDataset.InitIncreaseStock();
        var PreviewId   = "";
        var PreviewRoot = new HTMLEngine.MyRootHtmlNode();

        foreach (var increase in TraningDataset.IncreaseStockList)
        {
            // Re-parse only when the announcement changes. The previous condition was inverted
            // and never updated PreviewId, so no HTML file was ever loaded.
            if (!PreviewId.Equals(increase.id))
            {
                var htmlfile = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\定增\html\" + increase.id + ".html";
                PreviewRoot = HTMLEngine.Anlayze(htmlfile);
                PreviewId   = increase.id;
            }
            TableAnlayzeTool.PutTrainingItem(PreviewRoot, increase.PublishTarget);
        }

        var Rank = TableAnlayzeTool.TrainingTitleResult.Values.ToList();
        if (Rank.Count == 0)
        {
            // Nothing was collected, so there is nothing to rank or print.
            return;
        }

        // Sort counts in descending order.
        Rank.Sort();
        Rank.Reverse();
        // Threshold is the 10th-highest count; clamp the index so short lists do not throw.
        var Top10 = Rank[Math.Min(9, Rank.Count - 1)];

        foreach (var title in TableAnlayzeTool.TrainingTitleResult)
        {
            if (title.Value >= Top10)
            {
                Console.WriteLine(title.Key + ":" + title.Value);
            }
        }
    }
コード例 #6
0
    /// <summary>
    /// Locates every occurrence of each caller-supplied word in every sentence of the document.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <param name="CustomerWord">Words to search for. Null or empty entries are skipped.</param>
    /// <param name="description">Value stored in the <c>Description</c> of every result.</param>
    /// <returns>
    /// One entry per non-overlapping occurrence, carrying the sentence's <c>PositionId</c>, the
    /// word and the index of the occurrence within the sentence content.
    /// </returns>
    public static List <LocAndValue <String> > LocateCustomerWord(HTMLEngine.MyRootHtmlNode root,
                                                                  List <String> CustomerWord, string description = "字符")
    {
        var list = new List <LocAndValue <String> >();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var OrgString = sentence.Content;
                foreach (var word in CustomerWord)
                {
                    // An empty word matches at every index without advancing the scan position,
                    // which would loop forever; a null word makes IndexOf throw.
                    if (String.IsNullOrEmpty(word))
                    {
                        continue;
                    }

                    // Ordinal search: a hit always spans exactly word.Length characters, so the
                    // next start index advances and never passes the end of the string.
                    int matchIdx = OrgString.IndexOf(word, 0, StringComparison.Ordinal);
                    while (matchIdx != -1)
                    {
                        list.Add(new LocAndValue <String>()
                        {
                            Loc         = sentence.PositionId,
                            Description = description,
                            Value       = word,
                            StartIdx    = matchIdx
                        });
                        matchIdx = OrgString.IndexOf(word, matchIdx + word.Length, StringComparison.Ordinal);
                    }
                }
            }
        }
        return(list);
    }
コード例 #7
0
ファイル: TableAnlayzeTool.cs プロジェクト: PeiYangLiu/FDDC
 /// <summary>
 /// Counts, per column header, how many table cells equal the keyword. A cell counts when its
 /// value (optionally passed through <c>Transform</c>) and the keyword are equal after
 /// <c>NormalizeTextResult</c>. Counts accumulate in <c>TrainingTitleResult</c>.
 /// </summary>
 /// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
 /// <param name="KeyWord">Expected cell value to look for.</param>
 public void PutTitleTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
 {
     foreach (var tableEntry in root.TableList)
     {
         var table = new HTMLTable(tableEntry.Value);
         // Row 1 supplies the headers, so data scanning starts at row 2.
         // NOTE(review): "< RowCount" / "< ColumnCount" may skip the last row and column if
         // HTMLTable indices are 1-based - confirm.
         for (int row = 2; row < table.RowCount; row++)
         {
             for (int col = 1; col < table.ColumnCount; col++)
             {
                 var header = table.CellValue(1, col).Replace(" ", "");
                 if (String.IsNullOrEmpty(header))
                 {
                     // Columns without a header cannot be attributed to a title.
                     continue;
                 }

                 var cell = table.CellValue(row, col);
                 if (Transform != null)
                 {
                     cell = Transform(cell, header);
                 }

                 bool isKeywordCell = cell.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult());
                 if (!isKeywordCell)
                 {
                     continue;
                 }

                 if (TrainingTitleResult.ContainsKey(header))
                 {
                     TrainingTitleResult[header]++;
                 }
                 else
                 {
                     TrainingTitleResult.Add(header, 1);
                 }
             }
         }
     }
 }
コード例 #8
0
    /// <summary>
    /// Extracts the text enclosed in book-title marks (《…》) and in curly double quotes (“…”)
    /// from every sentence of the document.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <param name="IsSkipBracket">
    /// When true, a quoted span is dropped if any entry returned by
    /// <c>RegularTool.GetChineseBrackets</c> for the sentence contains it.
    /// </param>
    /// <returns>
    /// One entry per kept span, with the enclosing marks stripped from <c>Value</c>. Within a
    /// sentence, all book-title matches come before all quote matches.
    /// </returns>
    public static List <LocAndValue <String> > LocateQuotation(HTMLEngine.MyRootHtmlNode root, bool IsSkipBracket = true)
    {
        var list = new List <LocAndValue <String> >();
        // Both patterns are sentence-independent: build them once and run the same handling for
        // each. Array order fixes the per-sentence result order.
        var quotePatterns = new Regex[] { new Regex(@"\《.*?\》"), new Regex(@"\“.*?\”") };

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var OrgString   = sentence.Content;
                var BracketList = RegularTool.GetChineseBrackets(OrgString);

                foreach (var pattern in quotePatterns)
                {
                    foreach (Match item in pattern.Matches(OrgString))
                    {
                        if (IsSkipBracket)
                        {
                            bool IsContentInBracket = false;
                            foreach (var bracketItem in BracketList)
                            {
                                if (bracketItem.Contains(item.Value))
                                {
                                    IsContentInBracket = true;
                                    break;
                                }
                            }
                            if (IsContentInBracket)
                            {
                                continue;
                            }
                        }

                        list.Add(new LocAndValue <String>()
                        {
                            Loc   = sentence.PositionId,
                            Type  = "字符",
                            // Drop the opening and closing mark (one character each).
                            Value = item.Value.Substring(1, item.Value.Length - 2)
                        });
                    }
                }
            }
        }
        return(list);
    }
コード例 #9
0
    /// <summary>
    /// Locates dates in every sentence of the document. Each sentence is passed through
    /// <c>DateUtility.ConvertUpperToLower</c>, stripped of spaces and handed to
    /// <c>DateUtility.GetDate</c>; every returned date string is split into numbers, and the first
    /// three are read as year, month and day.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>
    /// One entry per date string whose first three numbers parse as integers; <c>Value</c> is the
    /// result of <c>DateUtility.GetWorkDay(year, month, day)</c>.
    /// </returns>
    public static List <LocAndValue <DateTime> > LocateDate(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue <DateTime> >();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var OrgString = sentence.Content;
                OrgString = DateUtility.ConvertUpperToLower(OrgString).Replace(" ", String.Empty);
                var datelist = DateUtility.GetDate(OrgString);
                foreach (var strDate in datelist)
                {
                    var DateNumberList = RegularTool.GetNumberList(strDate);
                    // Year, month and day are all required; skip anything shorter rather than
                    // failing on an out-of-range index.
                    if (DateNumberList.Count < 3)
                    {
                        continue;
                    }
                    String Year = DateNumberList[0];
                    String Month = DateNumberList[1];
                    String Day = DateNumberList[2];
                    int    year; int month; int day;
                    if (int.TryParse(Year, out year) && int.TryParse(Month, out month) && int.TryParse(Day, out day))
                    {
                        list.Add(new LocAndValue <DateTime>()
                        {
                            Loc   = sentence.PositionId,
                            Type  = "日期",
                            Value = DateUtility.GetWorkDay(year, month, day)
                        });
                    }
                }
            }
        }
        return(list);
    }
コード例 #10
0
    /// <summary>
    /// Private-placement training. Walks <c>TraningDataset.IncreaseStockList</c>, parses the
    /// announcement HTML whenever the record id differs from the previous record's id, and
    /// collects table-title statistics for three fields: publish target, increase number (values
    /// passed through <c>NumberUtility.NormalizerStockNumber</c>) and increase money (values passed
    /// through <c>MoneyUtility.Format</c>). The result of <c>Utility.FindTop(10, ...)</c> for each
    /// field is written to <c>Program.Training</c>.
    /// </summary>
    /// <param name="TraningCnt">
    /// Maximum number of announcements to parse and train on. Training stops when a further
    /// announcement would exceed this number.
    /// </param>
    public static void Training(int TraningCnt = int.MaxValue)
    {
        var TargetTool         = new TableAnlayzeTool();
        var IncreaseNumberTool = new TableAnlayzeTool();

        IncreaseNumberTool.Transform = NumberUtility.NormalizerStockNumber;
        var IncreaseMoneyTool = new TableAnlayzeTool();

        IncreaseMoneyTool.Transform = MoneyUtility.Format;
        var PreviewId   = String.Empty;
        var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
        int Cnt         = 0;

        foreach (var increase in TraningDataset.IncreaseStockList)
        {
            if (!PreviewId.Equals(increase.Id))
            {
                // Check the limit BEFORE parsing the next announcement. Checking after the parse
                // would discard the announcement just loaded, training on one fewer than asked.
                if (Cnt == TraningCnt)
                {
                    break;
                }
                var htmlfile = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\定增\html\" + increase.Id + ".html";
                PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
                PreviewId   = increase.Id;
                Cnt++;
            }
            TargetTool.PutTitleTrainingItem(PreviewRoot, increase.PublishTarget);
            IncreaseNumberTool.PutTitleTrainingItem(PreviewRoot, increase.IncreaseNumber);
            IncreaseMoneyTool.PutTitleTrainingItem(PreviewRoot, increase.IncreaseMoney);
        }

        var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);

        Program.Training.WriteLine("增发对象");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }

        rank = Utility.FindTop(10, IncreaseNumberTool.TrainingTitleResult);
        Program.Training.WriteLine("增发数量");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }

        rank = Utility.FindTop(10, IncreaseMoneyTool.TrainingTitleResult);
        Program.Training.WriteLine("增发金额");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }
    }
コード例 #11
0
    /// <summary>
    /// Runs <c>GetCompany</c> over every sentence of the document, accumulating the results in a
    /// single list.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>The list that <c>GetCompany</c> was given for every sentence.</returns>
    public static List <struCompanyName> GetCompanyNameByCutWordFromHTML(HTMLEngine.MyRootHtmlNode root)
    {
        var companies = new List <struCompanyName>();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var text     = sentence.Content;
                var position = sentence.PositionId;
                GetCompany(companies, text, position);
            }
        }
        return(companies);
    }
コード例 #12
0
ファイル: BussinessLogic.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Extracts candidates with an <c>EntityProperty</c> whose trailing word is "公司董事会",
    /// reverses the candidate list, logs the first candidate and returns it.
    /// </summary>
    /// <param name="root">Parsed document handed to the extractor.</param>
    /// <returns>
    /// The first candidate after reversal, or an empty string when there are none.
    /// NOTE(review): the log line appends "公司" to the candidate but the returned value does not -
    /// confirm whether callers expect the suffix.
    /// </returns>
    public static string GetCompanyFullName(HTMLEngine.MyRootHtmlNode root)
    {
        var extractor = new EntityProperty
        {
            TrailingWordList = new string[] { "公司董事会" }
        };

        extractor.Extract(root);
        extractor.CandidateWord.Reverse();
        foreach (var candidate in extractor.CandidateWord)
        {
            // Only the first candidate is used.
            Program.Logger.WriteLine("全称:[" + candidate + "公司]");
            return(candidate);
        }
        return("");
    }
コード例 #13
0
ファイル: LocateProperty.cs プロジェクト: kimmow/FDDC
    // Locate monetary amounts.
    // NOTE(review): as written this always returns an empty list - the value returned by
    // Utility.SeekMoney is never added to the result. Looks unfinished; confirm intended behaviour.
    public static List <LocAndValue> LocateMoney(HTMLEngine.MyRootHtmlNode root)
    {
        var located = new List <LocAndValue>();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                // Normalize the sentence, then strip spaces before searching.
                var normalized = Utility.ConvertUpperDateToLittle(sentence.Content).Replace(" ", "");
                var money      = Utility.SeekMoney(normalized);
            }
        }
        return(located);
    }
コード例 #14
0
    /// <summary>
    /// Returns the distinct values of <c>KeyWordMap</c> whose key is found at least once in the
    /// document, in map iteration order.
    /// </summary>
    /// <param name="root">Parsed document passed to <c>ExtractPropertyByHTML.FindWordCnt</c>.</param>
    /// <returns>Mapped values without duplicates; empty when no key is found.</returns>
    List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
    {
        var mapped = new List <string>();

        foreach (var entry in KeyWordMap)
        {
            var hits = ExtractPropertyByHTML.FindWordCnt(entry.Key, root).Count;
            if (hits <= 0 || mapped.Contains(entry.Value))
            {
                // Either the key is absent or its value is already reported.
                continue;
            }
            mapped.Add(entry.Value);
        }
        return(mapped);
    }
コード例 #15
0
ファイル: EntityProperty.cs プロジェクト: toby2o12/FDDC
    /// <summary>
    /// Returns the distinct values of <c>KeyWordMap</c> whose key is reported present in the
    /// document by <c>ExtractPropertyByHTML.HasWord</c>, in map iteration order.
    /// </summary>
    /// <param name="root">Parsed document passed to <c>ExtractPropertyByHTML.HasWord</c>.</param>
    /// <returns>Mapped values without duplicates; empty when no key is found.</returns>
    List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
    {
        var mapped = new List <string>();

        foreach (var entry in KeyWordMap)
        {
            if (!ExtractPropertyByHTML.HasWord(entry.Key, root))
            {
                continue;
            }
            if (mapped.Contains(entry.Value))
            {
                // Several keys may map to the same value; report it once.
                continue;
            }
            mapped.Add(entry.Value);
        }
        return(mapped);
    }
コード例 #16
0
    /// <summary>
    /// Collects table-title statistics for the evaluation-method field of the reorganization
    /// training set. The announcement HTML is parsed whenever the record id differs from the
    /// previously parsed id; records whose HTML file does not exist are skipped. For records with
    /// a non-empty <c>EvaluateMethod</c>, <c>PutTitleTrainingItemWithCodition</c> is called with
    /// the evaluate method and the target company. The results of <c>Utility.FindTop(10, ...)</c>
    /// for the title and condition tables are written to <c>Program.Training</c>, which is then
    /// flushed.
    /// </summary>
    /// <param name="TraningCnt">
    /// Maximum number of announcements to parse and train on. Training stops when a further
    /// announcement would exceed this number.
    /// </param>
    public static void GetEvaluateMethodTitle(int TraningCnt = int.MaxValue)
    {
        var TargetTool  = new TableAnlayzeTool();
        var PreviewId   = String.Empty;
        var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
        int Cnt         = 0;

        foreach (var ReOrg in TraningDataset.ReorganizationList)
        {
            if (!PreviewId.Equals(ReOrg.Id))
            {
                // Check the limit BEFORE parsing the next announcement. Checking after the parse
                // would discard the announcement just loaded, training on one fewer than asked.
                if (Cnt == TraningCnt)
                {
                    break;
                }
                var htmlfile = Program.ReorganizationPath_TRAIN + Path.DirectorySeparatorChar + @"html" + Path.DirectorySeparatorChar + ReOrg.Id + ".html";
                if (!System.IO.File.Exists(htmlfile))
                {
                    // No source document for this record.
                    continue;
                }
                PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
                PreviewId   = ReOrg.Id;
                Cnt++;
            }
            if (!String.IsNullOrEmpty(ReOrg.EvaluateMethod))
            {
                TargetTool.PutTitleTrainingItemWithCodition(PreviewRoot, ReOrg.EvaluateMethod, ReOrg.TargetCompany);
            }
        }

        var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);

        Program.Training.WriteLine("评估方法");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }
        Program.Training.WriteLine("标的");
        rank = Utility.FindTop(10, TargetTool.TrainingTitleCondition);
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }
        Program.Training.Flush();
    }
コード例 #17
0
    /// <summary>
    /// Finds the table titles under which trade counterparties appear. The announcement HTML is
    /// parsed whenever the record id differs from the previously parsed id; records whose HTML
    /// file does not exist are skipped. Each record's <c>TradeCompany</c> is split on
    /// <c>Utility.SplitChar</c> and every part is fed to <c>PutTitleTrainingItem</c>. The result of
    /// <c>Utility.FindTop(10, ...)</c> and every entry of <c>WholeHeaderRow</c> are written to
    /// <c>Program.Training</c>, which is then flushed.
    /// </summary>
    /// <param name="TraningCnt">
    /// Maximum number of announcements to parse and train on. Training stops when a further
    /// announcement would exceed this number.
    /// </param>
    public static void GetTradeCompanyTitle(int TraningCnt = int.MaxValue)
    {
        var TargetTool  = new TableAnlayzeTool();
        var PreviewId   = String.Empty;
        var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
        int Cnt         = 0;

        foreach (var ReOrg in TraningDataset.ReorganizationList)
        {
            if (!PreviewId.Equals(ReOrg.Id))
            {
                // Check the limit BEFORE parsing the next announcement. Checking after the parse
                // would discard the announcement just loaded, training on one fewer than asked.
                if (Cnt == TraningCnt)
                {
                    break;
                }
                // Path.Combine uses the platform separator instead of a hard-coded backslash.
                var htmlfile = System.IO.Path.Combine(Program.ReorganizationPath_TRAIN, "html", ReOrg.Id + ".html");
                if (!System.IO.File.Exists(htmlfile))
                {
                    // No source document for this record.
                    continue;
                }
                PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
                PreviewId   = ReOrg.Id;
                Cnt++;
            }
            foreach (var item in ReOrg.TradeCompany.Split(Utility.SplitChar))
            {
                TargetTool.PutTitleTrainingItem(PreviewRoot, item);
            }
        }

        var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);

        Program.Training.WriteLine("交易对象");
        foreach (var rec in rank)
        {
            Program.Training.WriteLine(rec.ToString());
        }

        foreach (var item in TargetTool.WholeHeaderRow)
        {
            Program.Training.WriteLine(item);
        }
        Program.Training.Flush();
    }
コード例 #18
0
ファイル: LocateProperty.cs プロジェクト: loooo139/gradute
    /// <summary>
    /// Locates share counts: runs <c>ExtractPropertyByHTML.RegularExFinder</c> on every sentence
    /// with a feature whose pattern is comma-grouped digits (<c>\d+(,\d+)+</c>) and whose trailing
    /// word is "股".
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>All results returned by the finder, concatenated in document order.</returns>
    public static List <LocAndValue <String> > LocateStockNumber(HTMLEngine.MyRootHtmlNode root)
    {
        var stockNumberFeature = new ExtractProperyBase.struRegularExpressFeature();
        stockNumberFeature.RegularExpress   = @"\d+(,\d+)+";
        stockNumberFeature.TrailingWordList = new string[] { "股" }.ToList();

        var located = new List <LocAndValue <String> >();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                located.AddRange(
                    ExtractPropertyByHTML.RegularExFinder(sentence.PositionId, sentence.Content, stockNumberFeature, "|"));
            }
        }
        return(located);
    }
コード例 #19
0
ファイル: BussinessLogic.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Extracts the company's short name using fixed leading phrases ("股票简称" / "证券简称").
    /// The first extracted candidate is cleaned: colons are removed, the text is trimmed, and it
    /// is then cut before each of a fixed sequence of markers.
    /// </summary>
    /// <param name="root">Parsed document handed to the extractor.</param>
    /// <returns>The cleaned first candidate, or an empty string when nothing was extracted.</returns>
    public static string GetCompanyShortName(HTMLEngine.MyRootHtmlNode root)
    {
        // Typical source text (from the original author's notes):
        //   股票简称:东方电气
        //   东方电气股份有限公司董事会
        var Extractor = new EntityProperty();

        Extractor.LeadingWordList = new string[] { "股票简称", "证券简称" };
        Extractor.Extract(root);

        // Markers that end a short name, applied in this order. Each cut works on the result of
        // the previous one.
        var cutMarkers = new string[] { "、", ")", "公告", "股票", "证券", " " };

        foreach (var item in Extractor.CandidateWord)
        {
            // NOTE(review): both Replace calls remove the same character as written; one of them
            // was possibly meant to remove the full-width colon - confirm against the repository.
            var ShortName = item.Replace(":", "").Replace(":", "").Trim();
            foreach (var marker in cutMarkers)
            {
                var head = Utility.GetStringBefore(ShortName, marker);
                // An empty result leaves the name untouched.
                if (head != "")
                {
                    ShortName = head;
                }
            }
            FDDC.Program.Logger.WriteLine("简称:[" + ShortName + "]");
            // Only the first candidate is used.
            return(ShortName);
        }
        return("");
    }
コード例 #20
0
ファイル: LocateProperty.cs プロジェクト: loooo139/gradute
    /// <summary>
    /// Locates every occurrence of each caller-supplied word in every sentence of the document.
    /// Spaces are removed from the sentence content before searching, so reported indices refer
    /// to the space-stripped text.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <param name="CustomerWord">Words to search for. Null or empty entries are skipped.</param>
    /// <param name="description">Value stored in the <c>Description</c> of every result.</param>
    /// <returns>
    /// One entry per non-overlapping occurrence, carrying the sentence's <c>PositionId</c>, the
    /// word and the index of the occurrence.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// Thrown when a single word produces more than 5000 hits in one sentence (runaway-loop guard).
    /// </exception>
    public static List <LocAndValue <String> > LocateCustomerWord(HTMLEngine.MyRootHtmlNode root,
                                                                  List <String> CustomerWord, string description = "字符")
    {
        var list = new List <LocAndValue <String> >();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var OrgString = sentence.Content.Replace(" ", "");
                foreach (var word in CustomerWord)
                {
                    if (String.IsNullOrEmpty(word))
                    {
                        continue;
                    }
                    int Count = 0;
                    // Ordinal search: a hit always spans exactly word.Length characters, so the
                    // next start index advances and never passes the end of the string.
                    int matchIdx = OrgString.IndexOf(word, 0, StringComparison.Ordinal);
                    while (matchIdx != -1)
                    {
                        list.Add(new LocAndValue <String>()
                        {
                            Loc         = sentence.PositionId,
                            Description = description,
                            Value       = word,
                            StartIdx    = matchIdx
                        });
                        Count++;
                        if (Count > 5000)
                        {
                            // Guard against a runaway loop: dump the inputs and abort.
                            Console.WriteLine("OrgString:" + OrgString);
                            Console.WriteLine("word:[" + word + "]");
                            throw new System.ArgumentException();
                        }
                        matchIdx = OrgString.IndexOf(word, matchIdx + word.Length, StringComparison.Ordinal);
                    }
                }
            }
        }
        return(list);
    }
コード例 #21
0
ファイル: LocateProperty.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Locates a date in every sentence of the document. Each sentence is passed through
    /// <c>Utility.ConvertUpperDateToLittle</c>, stripped of spaces and handed to
    /// <c>RegularTool.GetDate</c>; a non-empty result is recorded with the sentence's position.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>At most one entry per sentence; empty when no sentence yields a date.</returns>
    public static List <LocAndValue> LocateDate(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var OrgString = sentence.Content;
                OrgString = Utility.ConvertUpperDateToLittle(OrgString).Replace(" ", "");
                // Evaluate the extraction once and reuse it for both the check and the value.
                var dateText = RegularTool.GetDate(OrgString);
                if (!String.IsNullOrEmpty(dateText))
                {
                    list.Add(new LocAndValue()
                    {
                        Loc = sentence.PositionId, Value = dateText
                    });
                }
            }
        }
        return(list);
    }
コード例 #22
0
ファイル: EntityWordAnlayzeTool.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Segments the normalized full text of every paragraph and, for each token equal to
    /// <paramref name="KeyWord"/>, writes up to five tokens before it and up to five tokens after
    /// it to <c>Program.Training</c>. When a leading token is ":", the tokens of the window that
    /// precede that colon are concatenated and printed to the console.
    /// </summary>
    /// <param name="root">Parsed document; its <c>Children</c> are scanned as paragraphs.</param>
    /// <param name="KeyWord">
    /// Token to search for. It is added to the segmenter via <c>AddWord</c> before cutting.
    /// </param>
    public static void AnlayzeEntitySurroundWords(HTMLEngine.MyRootHtmlNode root, string KeyWord)
    {
        // Number of neighbouring tokens reported on each side of a hit.
        const int ContextSize = 5;

        Program.Training.WriteLine("关键字:[" + KeyWord + "]");
        JiebaSegmenter segmenter = new JiebaSegmenter();

        segmenter.AddWord(KeyWord);
        foreach (var paragrah in root.Children)
        {
            // The original author notes that Cut defaults to "accurate" mode.
            var segments = segmenter.Cut(paragrah.FullText.NormalizeKey()).ToList();

            // Look for every position where the keyword occurs.
            for (int i = 0; i < segments.Count; i++)
            {
                if (!segments[i].Equals(KeyWord))
                {
                    continue;
                }

                var startInx = Math.Max(0, i - ContextSize);
                // Exclusive upper bound. It must be i + 1 + ContextSize: using i + ContextSize
                // would report only four following tokens instead of five.
                var EndInx = Math.Min(i + 1 + ContextSize, segments.Count);

                for (int s = startInx; s < i; s++)
                {
                    Program.Training.WriteLine("前导关键字:[" + segments[s] + "]");
                    if (segments[s] == ":")
                    {
                        // Join the window tokens that precede this colon.
                        var leading = "";
                        for (int l = startInx; l < s; l++)
                        {
                            leading += segments[l];
                        }
                        Console.WriteLine("冒号前导词:" + leading);
                    }
                }
                Program.Training.WriteLine("关键字:[" + KeyWord + "]");
                for (int s = i + 1; s < EndInx; s++)
                {
                    Program.Training.WriteLine("后续关键字:[" + segments[s] + "]");
                }
            }
        }
    }
コード例 #23
0
ファイル: TableAnlayzeTool.cs プロジェクト: loooo139/gradute
 /// <summary>
 /// Counts the cell values that appear under specific column titles. For every non-empty cell
 /// whose column header (row 1, spaces removed) equals one of the given titles, the normalized
 /// cell value is counted in <c>TrainingValueResult</c>.
 /// </summary>
 /// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
 /// <param name="TitleKeyWord">
 /// Column titles of interest. A title listed more than once is counted once per listing.
 /// </param>
 public void PutValueTrainingItem(HTMLEngine.MyRootHtmlNode root, List <string> TitleKeyWord)
 {
     foreach (var tableEntry in root.TableList)
     {
         var table = new HTMLTable(tableEntry.Value);
         // Row 1 supplies the headers, so data scanning starts at row 2.
         // NOTE(review): "< RowCount" / "< ColumnCount" may skip the last row and column if
         // HTMLTable indices are 1-based - confirm.
         for (int row = 2; row < table.RowCount; row++)
         {
             for (int col = 1; col < table.ColumnCount; col++)
             {
                 var header = table.CellValue(1, col).Replace(" ", "");
                 if (String.IsNullOrEmpty(header))
                 {
                     continue;
                 }

                 var cellText = table.CellValue(row, col).NormalizeTextResult();
                 if (string.IsNullOrEmpty(cellText))
                 {
                     continue;
                 }

                 foreach (var wanted in TitleKeyWord)
                 {
                     if (!header.Equals(wanted))
                     {
                         continue;
                     }

                     if (TrainingValueResult.ContainsKey(cellText))
                     {
                         TrainingValueResult[cellText]++;
                     }
                     else
                     {
                         TrainingValueResult.Add(cellText, 1);
                     }
                 }
             }
         }
     }
 }
コード例 #24
0
ファイル: BussinessLogic.cs プロジェクト: kimmow/FDDC
    // Lexical (part-of-speech) analysis

    /// <summary>
    /// Builds candidate project names from part-of-speech tagged sentences. For every token that
    /// is "标段", "工程" or "项目", the tokens are walked backwards from that token; at the first
    /// token flagged "ns" (noted as a place name by the original author), all words from that
    /// token through the anchor token are concatenated and recorded.
    /// </summary>
    /// <param name="root">Parsed document; paragraphs are <c>root.Children</c>, sentences are each paragraph's <c>Children</c>.</param>
    /// <returns>Candidate names in document order; empty when none is found.</returns>
    public static List <String> GetProjectName(HTMLEngine.MyRootHtmlNode root)
    {
        var posSegmenter = new PosSegmenter();
        var candidates   = new List <String>();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var tokens = posSegmenter.Cut(sentence.Content).ToList();
                for (int anchor = 0; anchor < tokens.Count; anchor++)
                {
                    bool isProjectNoun = tokens[anchor].Word == "标段" ||
                                         tokens[anchor].Word == "工程" ||
                                         tokens[anchor].Word == "项目";
                    if (!isProjectNoun)
                    {
                        continue;
                    }

                    // Walk backwards looking for the nearest "ns"-flagged token.
                    for (int placeIdx = anchor; placeIdx > -1; placeIdx--)
                    {
                        if (tokens[placeIdx].Flag != "ns")
                        {
                            continue;
                        }

                        var projectName = "";
                        for (int k = placeIdx; k <= anchor; k++)
                        {
                            projectName += tokens[k].Word;
                        }
                        candidates.Add(projectName);
                        // Stop at the nearest match; earlier ones are ignored.
                        break;
                    }
                }
            }
        }
        return(candidates);
    }
コード例 #25
0
    /// <summary>
    /// Locates date ranges in every sentence of the document.
    /// </summary>
    /// <remarks>
    /// Each sentence is normalized with <c>DateUtility.ConvertUpperToLower</c> and stripped of
    /// spaces, then <c>DateUtility.GetRangeDate</c> supplies the candidate range strings.
    /// The numbers found in a candidate are interpreted by how many there are:
    /// 6 = y/m/d + y/m/d, 5 = y/m/d + m/d (start year reused), 4 = y/m/d + d (start year and
    /// month reused). Candidates with any other count are ignored. A side whose numbers fail
    /// to parse is left as <c>default(DateTime)</c>; the entry is still emitted.
    /// </remarks>
    /// <param name="root">Document root; its children are walked as paragraphs, theirs as sentences.</param>
    /// <returns>One entry per accepted candidate, carrying the sentence's <c>PositionId</c>.</returns>
    public static List<LocAndValue<(DateTime StartDate, DateTime EndDate)>> LocateDateRange(HTMLEngine.MyRootHtmlNode root)
    {
        // Converts three numeric strings to a date through DateUtility.GetWorkDay;
        // falls back to default(DateTime) when any of them is not an integer.
        DateTime ToDate(String yearText, String monthText, String dayText)
        {
            int y; int m; int d;
            if (int.TryParse(yearText, out y) && int.TryParse(monthText, out m) && int.TryParse(dayText, out d))
            {
                return DateUtility.GetWorkDay(y, m, d);
            }
            return new DateTime();
        }

        var found = new List<LocAndValue<(DateTime StartDate, DateTime EndDate)>>();

        foreach (var para in root.Children)
        {
            foreach (var sent in para.Children)
            {
                var text = sent.Content;
                text = DateUtility.ConvertUpperToLower(text).Replace(" ", String.Empty);

                foreach (var rangeText in DateUtility.GetRangeDate(text))
                {
                    var numbers = RegularTool.GetNumberList(rangeText);

                    // Positions (within numbers) of the end date's year, month and day.
                    int endYearAt; int endMonthAt; int endDayAt;
                    switch (numbers.Count)
                    {
                        case 6:
                            endYearAt = 3; endMonthAt = 4; endDayAt = 5;
                            break;
                        case 5:
                            endYearAt = 0; endMonthAt = 3; endDayAt = 4;
                            break;
                        case 4:
                            endYearAt = 0; endMonthAt = 1; endDayAt = 3;
                            break;
                        default:
                            continue;   // not a recognized range layout
                    }

                    // Start date is resolved before the end date.
                    DateTime rangeStart = ToDate(numbers[0], numbers[1], numbers[2]);
                    DateTime rangeEnd   = ToDate(numbers[endYearAt], numbers[endMonthAt], numbers[endDayAt]);

                    found.Add(new LocAndValue<(DateTime StartDate, DateTime EndDate)>()
                    {
                        Loc   = sent.PositionId,
                        Type  = "日期范围",
                        Value = (rangeStart, rangeEnd)
                    });
                }
            }
        }
        return found;
    }
コード例 #26
0
    /// <summary>
    /// Extracts company full names, and a quoted short name when one follows, from every
    /// sentence of the document using part-of-speech segmentation.
    /// </summary>
    /// <remarks>
    /// A candidate is anchored at the word "有限公司", or at "有限" directly followed by "合伙".
    /// The full name runs from the nearest preceding word flagged "ns" (treated here as a
    /// place name) up to the anchor. Anchors with no such preceding word produce no entry.
    /// </remarks>
    /// <param name="root">Document root; its children are walked as paragraphs, theirs as sentences.</param>
    /// <returns>The company names found, in document order.</returns>
    public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root)
    {
        var posSeg   = new PosSegmenter();
        var namelist = new List<struCompanyName>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                // Nothing to segment; also avoids handing null to the segmenter.
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var words = posSeg.Cut(sentence.Content).ToList();
                for (int baseInd = 0; baseInd < words.Count; baseInd++)
                {
                    var FullName  = "";
                    var ShortName = "";
                    if (words[baseInd].Word == "有限公司" ||
                        (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙"))
                    {
                        // Walk backwards looking for a place name that starts the company name.
                        for (int NRIdx = baseInd; NRIdx > -1; NRIdx--)
                        {
                            // Place name
                            if (words[NRIdx].Flag == "ns")
                            {
                                FullName = "";
                                for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++)
                                {
                                    FullName += words[companyFullNameInd].Word;
                                }
                                if (words[baseInd].Word == "有限")
                                {
                                    // "合伙" is guaranteed by the anchor test above.
                                    FullName += words[baseInd + 1].Word;
                                    // The word after "合伙" is not guaranteed to exist when the
                                    // sentence ends there, so only append it when present.
                                    if (baseInd + 2 < words.Count)
                                    {
                                        FullName += words[baseInd + 2].Word;
                                    }
                                }
                                break;  // stop at the nearest place name
                            }
                        }
                        // Look forward for a "简称" (short name) marker.
                        for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++)
                        {
                            if (words[JCIdx].Word.Equals("简称"))
                            {
                                // The short name is the text between the last “ seen and the
                                // first ” at or after the anchor.
                                var ShortNameStart = -1;
                                var ShortNameEnd   = -1;
                                for (int ShortNameIdx = baseInd; ShortNameIdx < words.Count; ShortNameIdx++)
                                {
                                    if (words[ShortNameIdx].Word.Equals("“"))
                                    {
                                        ShortNameStart = ShortNameIdx + 1;
                                    }
                                    if (words[ShortNameIdx].Word.Equals("”"))
                                    {
                                        ShortNameEnd = ShortNameIdx - 1;
                                        break;
                                    }
                                }
                                if (ShortNameStart != -1 && ShortNameEnd != -1)
                                {
                                    ShortName = "";
                                    for (int i = ShortNameStart; i <= ShortNameEnd; i++)
                                    {
                                        ShortName += words[i].Word;
                                    }
                                }
                            }
                        }
                        // Only anchors that resolved to a full name are reported.
                        if (FullName != "")
                        {
                            namelist.Add(new struCompanyName()
                            {
                                secFullName = FullName, secShortName = ShortName
                            });
                        }
                    }
                }
            }
        }
        return(namelist);
    }
コード例 #27
0
    /// <summary>
    /// Locates money amounts in every sentence of the document.
    /// </summary>
    /// <remarks>
    /// Each sentence is normalized with <c>MoneyUtility.ConvertUpperToLower</c>, stripped of
    /// spaces, and handed to <c>MoneyUtility.SeekMoney</c>; every item it yields becomes one entry.
    /// </remarks>
    /// <param name="root">Document root; its children are walked as paragraphs, theirs as sentences.</param>
    /// <returns>One entry per amount found, carrying the sentence's <c>PositionId</c>.</returns>
    public static List<LocAndValue<(String MoneyAmount, String MoneyCurrency)>> LocateMoney(HTMLEngine.MyRootHtmlNode root)
    {
        var found = new List<LocAndValue<(String MoneyAmount, String MoneyCurrency)>>();

        foreach (var para in root.Children)
        {
            foreach (var sent in para.Children)
            {
                var text = sent.Content;
                text = MoneyUtility.ConvertUpperToLower(text).Replace(" ", String.Empty);

                foreach (var hit in MoneyUtility.SeekMoney(text))
                {
                    var entry = new LocAndValue<(String MoneyAmount, String MoneyCurrency)>
                    {
                        Loc   = sent.PositionId,
                        Type  = "金额",
                        Value = hit
                    };
                    found.Add(entry);
                }
            }
        }
        return found;
    }
コード例 #28
0
ファイル: BussinessLogic.cs プロジェクト: kimmow/FDDC
    /// <summary>
    /// Extracts company full names, and a quoted short name when one follows, from every
    /// non-empty sentence of the document using part-of-speech segmentation.
    /// </summary>
    /// <remarks>
    /// A candidate is anchored at "有限公司", at "公司" preceded by "承包", or at "有限" directly
    /// followed by "合伙". The full name runs from the nearest preceding place-name word
    /// (<c>EntityWordAnlayzeTool.地名</c>) up to the anchor. The backward search never crosses
    /// the anchor of the previously accepted company in the same sentence.
    /// </remarks>
    /// <param name="root">Document root; its children are walked as paragraphs, theirs as sentences.</param>
    /// <returns>The company names found, each with its sentence position and starting word index.</returns>
    public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root)
    {
        var posSeg   = new PosSegmenter();
        var namelist = new List<struCompanyName>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var words         = posSeg.Cut(sentence.Content).ToList();
                // Anchor index of the last accepted company; backward searches stop after it.
                var PreviewEndIdx = -1;
                for (int baseInd = 0; baseInd < words.Count; baseInd++)
                {
                    var FullName     = "";
                    var ShortName    = "";
                    var IsSubCompany = false;
                    var StartIdx     = -1;
                    if (
                        words[baseInd].Word == "有限公司" ||
                        (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                        (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙")
                        )
                    {
                        // Walk backwards looking for a place name that starts the company name.
                        for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--)
                        {
                            // Place name
                            if (words[NRIdx].Flag == EntityWordAnlayzeTool.地名)
                            {
                                FullName = "";
                                for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++)
                                {
                                    FullName += words[companyFullNameInd].Word;
                                }

                                // Contractor company ("承包" + "公司"): the anchor already ends
                                // the name, nothing more to append.

                                // Limited partnership: "有限" + "合伙" + one trailing word.
                                if (words[baseInd].Word == "有限")
                                {
                                    // "合伙" is guaranteed by the anchor test above.
                                    FullName += words[baseInd + 1].Word;
                                    // The word after "合伙" is not guaranteed to exist when the
                                    // sentence ends there, so only append it when present.
                                    if (baseInd + 2 < words.Count)
                                    {
                                        FullName += words[baseInd + 2].Word;
                                    }
                                }
                                // Subsidiary check: the word just before the name is "子公司".
                                if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司")
                                {
                                    IsSubCompany = true;
                                }
                                StartIdx      = NRIdx;
                                PreviewEndIdx = baseInd;
                                break;  // stop at the nearest place name
                            }
                        }

                        // Look forward for a "简称" (short name) marker.
                        for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++)
                        {
                            if (words[JCIdx].Word.Equals("简称"))
                            {
                                // The short name is the text between the last “ seen and the
                                // first ” at or after the anchor.
                                var ShortNameStart = -1;
                                var ShortNameEnd   = -1;
                                for (int ShortNameIdx = baseInd; ShortNameIdx < words.Count; ShortNameIdx++)
                                {
                                    if (words[ShortNameIdx].Word.Equals("“"))
                                    {
                                        ShortNameStart = ShortNameIdx + 1;
                                    }
                                    if (words[ShortNameIdx].Word.Equals("”"))
                                    {
                                        ShortNameEnd = ShortNameIdx - 1;
                                        break;
                                    }
                                }
                                if (ShortNameStart != -1 && ShortNameEnd != -1)
                                {
                                    ShortName = "";
                                    for (int i = ShortNameStart; i <= ShortNameEnd; i++)
                                    {
                                        ShortName += words[i].Word;
                                    }
                                }
                            }
                        }
                        // Only anchors that resolved to a full name are reported.
                        if (FullName != "")
                        {
                            namelist.Add(new struCompanyName()
                            {
                                secFullName  = FullName,
                                secShortName = ShortName,
                                isSubCompany = IsSubCompany,
                                positionId   = sentence.PositionId,
                                WordIdx      = StartIdx
                            });
                        }
                    }
                }
            }
        }
        return(namelist);
    }
コード例 #29
0
ファイル: TableAnlayzeTool.cs プロジェクト: loooo139/gradute
 /// <summary>
 /// Title search with a condition: for table rows that contain <paramref name="ConditionKey"/>,
 /// tallies which column titles hold <paramref name="KeyWord"/>.
 /// </summary>
 /// <remarks>
 /// Titles are read from row 1 with spaces removed; columns with an empty title are ignored.
 /// Rows are scanned from index 2 and columns from index 1, both up to (not including) the
 /// table's RowCount / ColumnCount.
 /// NOTE(review): if CellValue is 1-based this skips the last row and column — confirm.
 /// Hits increment <c>TrainingTitleResult</c> (by value title) and
 /// <c>TrainingTitleCondition</c> (by the title of the column that satisfied the condition).
 /// </remarks>
 /// <param name="root">Document root whose TableList is scanned; nothing happens when it is null.</param>
 /// <param name="KeyWord">Text a cell must equal, after NormalizeTextResult on both sides (and after Transform, when set).</param>
 /// <param name="ConditionKey">Text a titled cell of the same row must contain, after NormalizeTextResult on both sides.</param>
 public void PutTitleTrainingItemWithCodition(HTMLEngine.MyRootHtmlNode root, string KeyWord, string ConditionKey)
 {
     if (root.TableList == null)
     {
         return;
     }

     foreach (var entry in root.TableList)
     {
         var table = new HTMLTable(entry.Value);
         for (int row = 2; row < table.RowCount; row++)
         {
             // Title of the first titled column whose cell contains the condition text;
             // stays null when the row does not satisfy the condition.
             string matchedConditionTitle = null;
             for (int col = 1; col < table.ColumnCount; col++)
             {
                 var header = table.CellValue(1, col).Replace(" ", "");
                 if (!String.IsNullOrEmpty(header))
                 {
                     var cell = table.CellValue(row, col);
                     if (cell.NormalizeTextResult().Contains(ConditionKey.NormalizeTextResult()))
                     {
                         matchedConditionTitle = header;
                         break;
                     }
                 }
             }
             if (matchedConditionTitle == null)
             {
                 continue;
             }

             // The row qualifies: count every titled column whose value equals the keyword.
             for (int col = 1; col < table.ColumnCount; col++)
             {
                 var header = table.CellValue(1, col).Replace(" ", "");
                 if (!String.IsNullOrEmpty(header))
                 {
                     var cell = table.CellValue(row, col);
                     if (Transform != null)
                     {
                         cell = Transform(cell, header);
                     }
                     if (!cell.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult()))
                     {
                         continue;
                     }

                     if (TrainingTitleResult.ContainsKey(header))
                     {
                         TrainingTitleResult[header]++;
                     }
                     else
                     {
                         TrainingTitleResult.Add(header, 1);
                     }

                     if (TrainingTitleCondition.ContainsKey(matchedConditionTitle))
                     {
                         TrainingTitleCondition[matchedConditionTitle]++;
                     }
                     else
                     {
                         TrainingTitleCondition.Add(matchedConditionTitle, 1);
                     }
                 }
             }
         }
     }
 }
コード例 #30
0
    /// <summary>
    /// Extracts company full names and short names from every non-empty sentence of the
    /// document using part-of-speech segmentation.
    /// </summary>
    /// <remarks>
    /// <para>"国家电网" followed by "公司" is reported directly as "国家电网公司".</para>
    /// <para>Otherwise a candidate is anchored at "有限公司", at "公司" preceded by "有限责任" or
    /// "承包", or at "有限" directly followed by "合伙". For each anchor:</para>
    /// <para>1. A short name is taken from the quoted text following the first "简称" or "称"
    /// at or after the anchor.</para>
    /// <para>2. The start of the full name is searched backwards for a place name (by flag or
    /// by <c>PosNS.NsDict</c>), never crossing the anchor of the previously accepted company
    /// and stopping at any punctuation other than parentheses.</para>
    /// <para>3. If no place name is accepted, the first two characters of a four-character
    /// short name, when they occur as a word before the anchor, mark the start instead.</para>
    /// <para>4. An empty short name is filled from <c>GetCompanyNameByFullName</c>.</para>
    /// </remarks>
    /// <param name="root">Document root; its children are walked as paragraphs, theirs as sentences.</param>
    /// <returns>The company names found, each with its sentence position and a score of 100.</returns>
    public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root)
    {
        var posSeg   = new PosSegmenter();
        var namelist = new List<struCompanyName>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var words         = posSeg.Cut(sentence.Content).ToList();
                // Anchor index of the last accepted company; backward searches stop after it.
                var PreviewEndIdx = -1;
                for (int baseInd = 0; baseInd < words.Count; baseInd++)
                {
                    var FullName     = String.Empty;
                    var ShortName    = String.Empty;
                    var IsSubCompany = false;
                    // Special case: "国家电网" + "公司" has no "有限公司" suffix.
                    if (words[baseInd].Word == "国家电网" &&
                        (baseInd + 1) < words.Count &&
                        words[baseInd + 1].Word == "公司")
                    {
                        namelist.Add(new struCompanyName()
                        {
                            secFullName = "国家电网公司",
                            positionId  = sentence.PositionId,
                            WordIdx     = baseInd,
                            Score       = 100
                        });
                        continue;
                    }
                    if (
                        words[baseInd].Word == "有限公司" ||
                        (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "有限责任") ||
                        (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                        (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙")
                        )
                    {
                        // Look forward for a short-name marker.
                        for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++)
                        {
                            // Short-name keyword
                            if (words[JCIdx].Word.Equals("简称") || words[JCIdx].Word.Equals("称"))
                            {
                                // The short name is the text between the last “ seen and the
                                // first ” at or after the marker.
                                var ShortNameStart = -1;
                                var ShortNameEnd   = -1;
                                for (int ShortNameIdx = JCIdx; ShortNameIdx < words.Count; ShortNameIdx++)
                                {
                                    if (words[ShortNameIdx].Word.Equals("“"))
                                    {
                                        ShortNameStart = ShortNameIdx + 1;
                                    }
                                    if (words[ShortNameIdx].Word.Equals("”"))
                                    {
                                        ShortNameEnd = ShortNameIdx - 1;
                                        break;
                                    }
                                }
                                if (ShortNameStart != -1 && ShortNameEnd != -1)
                                {
                                    ShortName = String.Empty;
                                    for (int i = ShortNameStart; i <= ShortNameEnd; i++)
                                    {
                                        ShortName += words[i].Word;
                                    }
                                }
                                break;  // only the first marker is considered
                            }
                        }

                        // For a four-character short name, its first two characters are used
                        // as a fallback marker for where the full name begins.
                        var FirstShortNameWord = String.Empty;
                        if (ShortName.Length == 4)
                        {
                            FirstShortNameWord = ShortName.Substring(0, 2);
                        }
                        // False while the backward scan is inside a "(...)" group.
                        var IsMarkClosed      = true;
                        var CompanyStartIdx   = -1;
                        var FirstShortNameIdx = -1; // index of the word matching the short-name prefix
                        // Walk backwards looking for a place name that starts the company name.
                        for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--)
                        {
                            if (words[NRIdx].Word == FirstShortNameWord)
                            {
                                FirstShortNameIdx = NRIdx;   // kept as a fallback
                            }
                            // Place name, either by segmenter flag or by dictionary lookup.
                            if (words[NRIdx].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx].Word))
                            {
                                // Place names can be adjacent (e.g. 上海市嘉定); keep walking so
                                // the name starts at the first of them.
                                if (NRIdx != 0 && (words[NRIdx - 1].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx - 1].Word)))
                                {
                                    continue;
                                }
                                FullName = String.Empty;
                                for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++)
                                {
                                    FullName += words[companyFullNameInd].Word;
                                }
                                // Limited partnership: "有限" + "合伙" + one trailing word.
                                if (words[baseInd].Word == "有限")
                                {
                                    FullName += words[baseInd + 1].Word;
                                    if ((baseInd + 2) < words.Count)
                                    {
                                        FullName += words[baseInd + 2].Word;
                                    }
                                }
                                // Subsidiary check: the word just before the name is "子公司".
                                if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司")
                                {
                                    IsSubCompany = true;
                                }
                                if (IsMarkClosed)
                                {
                                    // Place name found outside any parentheses: accept it.
                                    CompanyStartIdx = NRIdx;
                                    PreviewEndIdx   = baseInd;
                                    break;  // stop searching for place names
                                }
                            }
                            if (words[NRIdx].Flag == LTPTrainingNER.词性标点)
                            {
                                // Any punctuation other than parentheses ends the search.
                                if (words[NRIdx].Word != "(" && words[NRIdx].Word != ")")
                                {
                                    break;
                                }
                                // Scanning right-to-left, ")" enters a group and "(" leaves it.
                                if (words[NRIdx].Word == ")")
                                {
                                    IsMarkClosed = false;
                                }
                                if (words[NRIdx].Word == "(")
                                {
                                    IsMarkClosed = true;
                                }
                            }
                        }

                        // No place name accepted: fall back to the short-name prefix position.
                        // NOTE(review): CompanyStartIdx stays -1 on this path, so WordIdx is
                        // reported as -1 — confirm that is intended.
                        if (CompanyStartIdx == -1)
                        {
                            if (FirstShortNameIdx == -1)
                            {
                                continue;
                            }
                            // Skip when the short name itself starts with a place name.
                            if (posSeg.Cut(ShortName).First().Flag == LTPTrainingNER.地名)
                            {
                                continue;
                            }
                            FullName = String.Empty;
                            for (int NRIdx = FirstShortNameIdx; NRIdx <= baseInd; NRIdx++)
                            {
                                FullName += words[NRIdx].Word;
                            }
                            // Limited partnership: "有限" + "合伙" + one trailing word.
                            if (words[baseInd].Word == "有限")
                            {
                                // "合伙" is guaranteed by the anchor test above.
                                FullName += words[baseInd + 1].Word;
                                // The trailing word may be missing when the sentence ends at
                                // "合伙"; guard it exactly as the place-name path does.
                                if ((baseInd + 2) < words.Count)
                                {
                                    FullName += words[baseInd + 2].Word;
                                }
                            }
                            // Subsidiary check: the word just before the name is "子公司".
                            if (FirstShortNameIdx != 0 && words[FirstShortNameIdx - 1].Word == "子公司")
                            {
                                IsSubCompany = true;
                            }
                        }

                        if (FullName != String.Empty)
                        {
                            FullName  = FullName.Replace(" ", String.Empty).Trim();
                            ShortName = ShortName.Replace(" ", String.Empty).Trim();
                            // "公司" / "本公司" are generic references, not real short names.
                            if (ShortName == "公司" || ShortName == "本公司")
                            {
                                ShortName = String.Empty;
                            }
                            if (ShortName == String.Empty)
                            {
                                // Fall back to the short name resolved from the full name.
                                var json = GetCompanyNameByFullName(FullName);
                                ShortName = json.secShortName;
                            }
                            namelist.Add(new struCompanyName()
                            {
                                secFullName  = FullName,
                                secShortName = ShortName,
                                isSubCompany = IsSubCompany,
                                positionId   = sentence.PositionId,
                                WordIdx      = CompanyStartIdx,
                                Score        = 100
                            });
                        }
                    }
                }
            }
        }
        return(namelist);
    }