コード例 #1
0
 /// <summary>
 /// Rebuilds the candidate list from an HTML document by running every
 /// extraction strategy that has at least one feature configured.
 /// </summary>
 /// <param name="root">Root HTML node passed on to each extraction strategy.</param>
 public void Extract(MyRootHtmlNode root)
 {
     // Every call starts from an empty candidate list.
     CandidateWord.Clear();

     // Leading (colon) keyword list.
     if (LeadingColonKeyWordList.Length != 0)
     {
         ExtractByColonKeyWord(root);
     }

     // Trailing keyword list.
     if (TrailingWordList.Length != 0)
     {
         ExtractByTrailingKeyWord(root);
     }

     // Text wrapped by marker symbols.
     if (MarkFeature.Length != 0)
     {
         ExtractByMarkFeature(root);
     }

     // Text delimited by a start string and an end string.
     if (StartEndFeature.Length != 0)
     {
         ExtractByStartEndStringFeature(root);
     }

     // Regular-expression search.
     if (RegularExpressFeature.Length != 0)
     {
         ExtractByRegularExpressFeature(root);
     }
 }
コード例 #2
0
    /// <summary>
    /// Reads a text file and, for every non-empty line, adds the values returned by
    /// <c>GetValueInChineseBracketsLeadingKeyWord</c> for each configured keyword
    /// to <c>CandidateWord</c>.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// <c>Loc</c> is the index of the line among the non-empty lines of the file,
    /// not the physical line number, because empty lines are dropped while reading.
    /// </remarks>
    public void ExtractTextByInChineseBracketsColonKeyWord(string filename)
    {
        var lines = new List<String>();

        // 'using' closes the reader even when reading throws; the previous
        // explicit Close() was skipped on any exception.
        using (var sr = new StreamReader(filename))
        {
            while (!sr.EndOfStream)
            {
                var line = sr.ReadLine();
                if (!String.IsNullOrEmpty(line))
                {
                    lines.Add(line);
                }
            }
        }

        for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
        {
            var line = lines[CurrentLineIdx];
            foreach (var word in LeadingColonKeyWordListInChineseBrackets)
            {
                var result = GetValueInChineseBracketsLeadingKeyWord(line, word);
                foreach (var item in result)
                {
                    CandidateWord.Add(new LocAndValue<string>()
                    {
                        Loc   = CurrentLineIdx,
                        Value = item
                    });
                }
            }
        }
    }
コード例 #3
0
    /// <summary>
    /// Reads a text file and, for every start/end feature, adds each value that
    /// <c>RegularTool.GetMultiValueBetweenString</c> returns for a non-empty line
    /// to <c>CandidateWord</c>.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// Candidates are appended feature by feature, so they are not in line order;
    /// <c>Loc</c> records the index of the source line among the non-empty lines,
    /// matching what the other text-file extractors in this class store.
    /// </remarks>
    void ExtractByStartEndStringFeature(string filename)
    {
        var lines = new List<String>();

        // 'using' closes the reader even when reading throws; the previous
        // explicit Close() was skipped on any exception.
        using (var sr = new StreamReader(filename))
        {
            while (!sr.EndOfStream)
            {
                var line = sr.ReadLine();
                if (!String.IsNullOrEmpty(line))
                {
                    lines.Add(line);
                }
            }
        }

        foreach (var word in StartEndFeature)
        {
            for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
            {
                var line = lines[CurrentLineIdx];
                var list = RegularTool.GetMultiValueBetweenString(line, word.StartWith, word.EndWith);
                foreach (var item in list)
                {
                    CandidateWord.Add(new LocAndValue<string>()
                    {
                        // Previously left unset, which lost the line the value came from.
                        Loc   = CurrentLineIdx,
                        Value = item
                    });
                }
            }
        }
    }
コード例 #4
0
ファイル: ExtractPropertyByDP.cs プロジェクト: toby2o12/FDDC
 /// <summary>
 /// Extracts candidate text using start and end keywords.
 /// </summary>
 /// <remarks>
 /// For each key and each paragraph the words are scanned in order. Once a start
 /// word has matched, the <c>cont</c> of the following words is concatenated
 /// (words whose relation is the right-adjunct relation are skipped entirely).
 /// A punctuation word seen while collecting discards the collected text.
 /// When an end word matches while collecting, the collected text is added to
 /// <c>CandidateWord</c>; note that the end word's own text has already been
 /// appended at that point.
 /// </remarks>
 /// <param name="keys">Start/end keyword definitions, with optional relation filters.</param>
 /// <param name="dplist">Paragraphs, each a list of parsed words.</param>
 public void StartWithKey(List <DPKeyWord> keys, List <List <struWordDP> > dplist)
 {
     foreach (var key in keys)
     {
         foreach (var paragraph in dplist)
         {
             var collecting = false;
             var collected  = String.Empty;

             foreach (var word in paragraph)
             {
                 var isPunctuation = word.cont == "。" || word.cont == ":" || word.cont == "," || word.cont == ";";
                 if (isPunctuation && collecting)
                 {
                     // Punctuation closes the current span without emitting anything.
                     collected  = String.Empty;
                     collecting = false;
                     continue;
                 }

                 if (collecting)
                 {
                     if (word.relate == LTPTrainingDP.右附加关系)
                     {
                         // Right-adjunct words are skipped, including the start/end checks below.
                         continue;
                     }
                     collected += word.cont;
                 }

                 // An empty relation filter accepts any relation.
                 var startMatches = key.StartWord.Contains(word.cont)
                                    && (key.StartDPValue.Length == 0 || key.StartDPValue.Contains(word.relate));
                 if (startMatches)
                 {
                     collecting = true;
                 }

                 var endMatches = key.EndWord.Contains(word.cont)
                                  && (key.EndDPValue.Length == 0 || key.EndDPValue.Contains(word.relate));
                 if (!endMatches)
                 {
                     continue;
                 }

                 if (collecting)
                 {
                     CandidateWord.Add(new LocAndValue<string>()
                     {
                         Value = collected
                     });
                 }
                 collecting = false;
                 collected  = String.Empty;
             }
         }
     }
 }
コード例 #5
0
    //候选词

    #region 常规文本
    /// <summary>
    /// Rebuilds the candidate list from a plain-text file by running every
    /// text extraction strategy that has at least one feature configured.
    /// </summary>
    /// <param name="filename">Path of the text file; nothing happens if it does not exist.</param>
    public void ExtractFromTextFile(string filename)
    {
        // A missing file leaves the existing candidates untouched.
        if (!File.Exists(filename))
        {
            return;
        }

        CandidateWord.Clear();

        // Leading (colon) keywords.
        if (LeadingColonKeyWordList.Length != 0)
        {
            ExtractTextByColonKeyWord(filename);
        }

        // Start-string / end-string features.
        if (StartEndFeature.Length != 0)
        {
            ExtractByStartEndStringFeature(filename);
        }

        // Leading keywords that appear inside Chinese brackets.
        if (LeadingColonKeyWordListInChineseBrackets.Length != 0)
        {
            ExtractTextByInChineseBracketsColonKeyWord(filename);
        }
    }
コード例 #6
0
 /// <summary>
 /// Shared search routine: applies an extraction function to every non-table
 /// content node and records the results as candidates.
 /// </summary>
 /// <param name="root">HTML root whose children are the paragraphs to scan.</param>
 /// <param name="ExtractMethod">
 /// Extraction function that takes a node's content and returns the candidate strings found in it.
 /// </param>
 void SearchNormalContent(MyRootHtmlNode root, Func <String, List <String> > ExtractMethod)
 {
     foreach (var paragraph in root.Children)
     {
         // Only the direct content nodes of each paragraph are visited; per the
         // original note, node content already includes nested lists, so those
         // are not walked again.
         foreach (var node in paragraph.Children)
         {
             // TableId of -1 marks non-table content; table nodes are skipped.
             if (node.TableId != -1)
             {
                 continue;
             }

             foreach (var found in ExtractMethod(node.Content))
             {
                 var entry = new LocAndValue<String>()
                 {
                     Loc   = node.PositionId,
                     Value = found
                 };
                 CandidateWord.Add(entry);
             }
         }
     }
 }
コード例 #7
0
    /// <summary>
    /// Reads a text file and, for every non-empty line, records the text that
    /// <c>Utility.GetStringAfter</c> returns for the first leading keyword that
    /// yields a non-empty result.
    /// </summary>
    /// <param name="filename">Path of the text file to scan.</param>
    /// <remarks>
    /// Spaces are stripped from each line before matching. At most one candidate
    /// is added per line. <c>Loc</c> is the index of the line among the non-empty
    /// lines of the file, because empty lines are dropped while reading.
    /// </remarks>
    public void ExtractTextByColonKeyWord(string filename)
    {
        var lines = new List<String>();

        // 'using' closes the reader even when reading throws; the previous
        // explicit Close() was skipped on any exception.
        using (var sr = new StreamReader(filename))
        {
            while (!sr.EndOfStream)
            {
                var line = sr.ReadLine();
                if (!String.IsNullOrEmpty(line))
                {
                    lines.Add(line);
                }
            }
        }

        for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
        {
            var line = lines[CurrentLineIdx].Replace(" ", "");
            foreach (var word in LeadingColonKeyWordList)
            {
                // Look the keyword up once; a null or empty result means this
                // keyword did not match and the next one is tried.
                var result = Utility.GetStringAfter(line, word);
                if (string.IsNullOrEmpty(result))
                {
                    continue;
                }

                CandidateWord.Add(new LocAndValue<string>()
                {
                    Loc   = CurrentLineIdx,
                    Value = result
                });
                // Only the first matching keyword is recorded for a line.
                break;
            }
        }
    }
コード例 #8
0
ファイル: Trie.cs プロジェクト: sandeepchauhan/libs
        /// <summary>
        /// Builds the set of candidates obtained from <paramref name="word"/> by a single
        /// insertion, replacement, deletion or adjacent transposition, applied at
        /// positions from <c>word.StartIndex</c> up to the end of the word.
        /// </summary>
        /// <param name="word">Source candidate; edits start at its <c>StartIndex</c>.</param>
        /// <returns>
        /// The generated candidates. Insertions and replacements carry position + 1 as
        /// their start index, deletions carry the position, and transpositions carry 0.
        /// </returns>
        // Assuming N as word length. Output can contain upto (26(N+1) - N + 25N + N + (N-1)) = 52N + 25 words.
        private HashSet<CandidateWord> GetCorrectionsWithEditDistanceOne(CandidateWord word)
        {
            string text = word.Word;
            int length = text.Length;
            var corrections = new HashSet<CandidateWord>();

            for (int pos = word.StartIndex; pos <= length; pos++)
            {
                // Insertions: possible at every position, including past the last character.
                foreach (char letter in Constants.EnglishAlphabet)
                {
                    corrections.Add(new CandidateWord(text.Insert(pos, letter.ToString()), pos + 1));
                }

                if (pos != length)
                {
                    // Replacements: every alphabet letter except the one already there.
                    char[] replaced = text.ToCharArray();
                    foreach (char letter in Constants.EnglishAlphabet)
                    {
                        if (letter == text[pos])
                        {
                            continue;
                        }
                        replaced[pos] = letter;
                        corrections.Add(new CandidateWord(new string(replaced), pos + 1));
                    }

                    // Deletions: drop the character at this position.
                    corrections.Add(new CandidateWord(text.Remove(pos, 1), pos));
                }

                // Transposes: swap this character with its right neighbour.
                if (pos < length - 1)
                {
                    char[] swapped = text.ToCharArray();
                    char held = swapped[pos];
                    swapped[pos] = swapped[pos + 1];
                    swapped[pos + 1] = held;
                    corrections.Add(new CandidateWord(new string(swapped), 0));
                }
            }

            return corrections;
        }