Exemple #1
0
    /// <summary>
    /// Reads the non-empty lines of a text file and, for every keyword in
    /// <c>LeadingColonKeyWordListInChineseBrackets</c>, appends each value returned by
    /// <c>GetValueInChineseBracketsLeadingKeyWord</c> to <c>CandidateWord</c>.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// The <c>Loc</c> stored with each candidate is the index of the line among the
    /// non-empty lines of the file (empty lines are dropped before indexing).
    /// </remarks>
    public void ExtractTextByInChineseBracketsColonKeyWord(string filename)
    {
        var lines = new List<string>();

        // The using block guarantees the reader is released even when reading throws.
        using (var reader = new StreamReader(filename))
        {
            string current;
            while ((current = reader.ReadLine()) != null)
            {
                if (!String.IsNullOrEmpty(current))
                {
                    lines.Add(current);
                }
            }
        }

        for (int lineIdx = 0; lineIdx < lines.Count; lineIdx++)
        {
            var line = lines[lineIdx];
            foreach (var word in LeadingColonKeyWordListInChineseBrackets)
            {
                var result = GetValueInChineseBracketsLeadingKeyWord(line, word);
                foreach (var item in result)
                {
                    CandidateWord.Add(new LocAndValue<string>()
                    {
                        Loc   = lineIdx,
                        Value = item
                    });
                }
            }
        }
    }
Exemple #2
0
    /// <summary>
    /// Reads the non-empty lines of a text file and, for every entry in
    /// <c>StartEndFeature</c>, appends each value that
    /// <c>RegularTool.GetMultiValueBetweenString</c> returns for the entry's
    /// <c>StartWith</c>/<c>EndWith</c> pair to <c>CandidateWord</c>.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// Candidates are added feature by feature (outer loop), then line by line.
    /// NOTE(review): unlike the other file-based extractors, <c>Loc</c> is not set on
    /// the candidates added here; confirm whether the line index should be recorded.
    /// </remarks>
    void ExtractByStartEndStringFeature(string filename)
    {
        var lines = new List<string>();

        // The using block guarantees the reader is released even when reading throws.
        using (var reader = new StreamReader(filename))
        {
            string current;
            while ((current = reader.ReadLine()) != null)
            {
                if (!String.IsNullOrEmpty(current))
                {
                    lines.Add(current);
                }
            }
        }

        foreach (var word in StartEndFeature)
        {
            foreach (var line in lines)
            {
                var list = RegularTool.GetMultiValueBetweenString(line, word.StartWith, word.EndWith);
                foreach (var item in list)
                {
                    CandidateWord.Add(new LocAndValue<string>()
                    {
                        Value = item
                    });
                }
            }
        }
    }
Exemple #3
0
 /// <summary>
 /// Extracts candidate text using start/end keywords.
 /// </summary>
 /// <remarks>
 /// For every key and every paragraph the words are scanned in order. Capturing is
 /// switched on by a word contained in <c>key.StartWord</c> and the text of the
 /// following words is accumulated. When a word contained in <c>key.EndWord</c> is
 /// reached while capturing, the accumulated text is added to <c>CandidateWord</c>.
 /// Because a word is appended before the end check runs, the end word's own text
 /// is part of the stored value. <c>Loc</c> is not set on the added candidates.
 /// </remarks>
 /// <param name="keys">Start/end keyword definitions to apply.</param>
 /// <param name="dplist">Paragraphs, each given as a list of words.</param>
 public void StartWithKey(List <DPKeyWord> keys, List <List <struWordDP> > dplist)
 {
     foreach (var key in keys)
     {
         foreach (var paragraph in dplist)
         {
             bool   capturing = false;
             string captured  = String.Empty;
             foreach (var word in paragraph)
             {
                 bool isDelimiter = word.cont == "。" || word.cont == ":" || word.cont == "," || word.cont == ";";
                 if (isDelimiter && capturing)
                 {
                     // A delimiter inside a capture discards what was collected so far.
                     captured  = String.Empty;
                     capturing = false;
                     continue;
                 }

                 if (capturing)
                 {
                     // Words with this relation are skipped entirely, including the
                     // start/end checks below.
                     if (word.relate == LTPTrainingDP.右附加关系)
                     {
                         continue;
                     }
                     captured += word.cont;
                 }

                 // An empty relation filter accepts any relation.
                 bool startRelationOk = key.StartDPValue.Length == 0 || key.StartDPValue.Contains(word.relate);
                 if (key.StartWord.Contains(word.cont) && startRelationOk)
                 {
                     capturing = true;
                 }

                 if (!key.EndWord.Contains(word.cont))
                 {
                     continue;
                 }

                 bool endRelationOk = key.EndDPValue.Length == 0 || key.EndDPValue.Contains(word.relate);
                 if (!endRelationOk)
                 {
                     continue;
                 }

                 if (capturing)
                 {
                     CandidateWord.Add(new LocAndValue <string>()
                     {
                         Value = captured
                     });
                 }
                 // A matching end word always resets the capture state.
                 capturing = false;
                 captured  = String.Empty;
             }
         }
     }
 }
Exemple #4
0
 /// <summary>
 /// Search workflow: applies an extraction method to the non-table content of a document.
 /// </summary>
 /// <remarks>
 /// Every string returned by <paramref name="ExtractMethod"/> is added to
 /// <c>CandidateWord</c> with <c>Loc</c> set to the content node's <c>PositionId</c>.
 /// </remarks>
 /// <param name="root">HTML root node.</param>
 /// <param name="ExtractMethod">Extraction method mapping HTML content to a list of candidate words.</param>
 void SearchNormalContent(MyRootHtmlNode root, Func <String, List <String> > ExtractMethod)
 {
     foreach (var section in root.Children)
     {
         // Take candidates from each paragraph's content; per the original note, the
         // content already includes nested lists, so they are not processed again here.
         foreach (var node in section.Children)
         {
             // Only nodes with TableId == -1 (non-table content) are processed.
             if (node.TableId != -1)
             {
                 continue;
             }

             List <String> found = ExtractMethod(node.Content);
             foreach (var text in found)
             {
                 var entry = new LocAndValue <String>();
                 entry.Loc   = node.PositionId;
                 entry.Value = text;
                 CandidateWord.Add(entry);
             }
         }
     }
 }
Exemple #5
0
    /// <summary>
    /// Reads the non-empty lines of a text file and, for each line, adds the text that
    /// <c>Utility.GetStringAfter</c> returns for the first keyword in
    /// <c>LeadingColonKeyWordList</c> that yields a non-empty result to <c>CandidateWord</c>.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// ASCII spaces are removed from each line before matching. At most one candidate is
    /// added per line. The <c>Loc</c> stored with each candidate is the index of the line
    /// among the non-empty lines of the file.
    /// </remarks>
    public void ExtractTextByColonKeyWord(string filename)
    {
        var lines = new List<string>();

        // The using block guarantees the reader is released even when reading throws.
        using (var reader = new StreamReader(filename))
        {
            string current;
            while ((current = reader.ReadLine()) != null)
            {
                if (!String.IsNullOrEmpty(current))
                {
                    lines.Add(current);
                }
            }
        }

        for (int lineIdx = 0; lineIdx < lines.Count; lineIdx++)
        {
            var line = lines[lineIdx].Replace(" ", "");
            foreach (var word in LeadingColonKeyWordList)
            {
                // Look the keyword up once; a null or empty result means "no match".
                var result = Utility.GetStringAfter(line, word);
                if (string.IsNullOrEmpty(result))
                {
                    continue;
                }

                CandidateWord.Add(new LocAndValue<string>()
                {
                    Loc   = lineIdx,
                    Value = result
                });
                // Only the first matching keyword is used for a line.
                break;
            }
        }
    }