/// <summary>
/// Clears <c>CandidateWord</c> and then runs, in a fixed order, every HTML extraction
/// pass whose feature list is non-empty.
/// </summary>
/// <param name="root">Root node of the HTML document handed to each extraction pass.</param>
public void Extract(MyRootHtmlNode root)
{
    // Every run starts from an empty candidate list.
    CandidateWord.Clear();

    // Leading keyword list
    if (LeadingColonKeyWordList.Length > 0)
    {
        ExtractByColonKeyWord(root);
    }

    // Trailing keyword list
    if (TrailingWordList.Length > 0)
    {
        ExtractByTrailingKeyWord(root);
    }

    // Values wrapped by marker symbols
    if (MarkFeature.Length > 0)
    {
        ExtractByMarkFeature(root);
    }

    // Values delimited by a start string and an end string
    if (StartEndFeature.Length > 0)
    {
        ExtractByStartEndStringFeature(root);
    }

    // Regular-expression search
    if (RegularExpressFeature.Length > 0)
    {
        ExtractByRegularExpressFeature(root);
    }
}
/// <summary>
/// Scans a text file line by line and, for every keyword in
/// <c>LeadingColonKeyWordListInChineseBrackets</c>, adds each value returned by
/// <c>GetValueInChineseBracketsLeadingKeyWord</c> to <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Empty lines are dropped before indexing, so <c>Loc</c> is the index among the
/// non-empty lines rather than the physical line number in the file.
/// </remarks>
public void ExtractTextByInChineseBracketsColonKeyWord(string filename)
{
    var lines = new List<String>();

    // "using" disposes the reader even when reading throws; the previous manual
    // Close() call was skipped on exceptions and leaked the file handle.
    using (var sr = new StreamReader(filename))
    {
        while (!sr.EndOfStream)
        {
            var line = sr.ReadLine();
            if (!String.IsNullOrEmpty(line))
            {
                lines.Add(line);
            }
        }
    }

    for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
    {
        var line = lines[CurrentLineIdx];
        foreach (var word in LeadingColonKeyWordListInChineseBrackets)
        {
            var result = GetValueInChineseBracketsLeadingKeyWord(line, word);
            foreach (var item in result)
            {
                CandidateWord.Add(new LocAndValue<string>() { Loc = CurrentLineIdx, Value = item });
            }
        }
    }
}
/// <summary>
/// Scans a text file and, for every entry of <c>StartEndFeature</c>, adds each value that
/// <c>RegularTool.GetMultiValueBetweenString</c> returns for a line (given the entry's
/// <c>StartWith</c> and <c>EndWith</c> strings) to <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Empty lines are dropped before indexing, so <c>Loc</c> is the index among the
/// non-empty lines rather than the physical line number in the file.
/// </remarks>
void ExtractByStartEndStringFeature(string filename)
{
    var lines = new List<String>();

    // "using" disposes the reader even when reading throws; the previous manual
    // Close() call was skipped on exceptions and leaked the file handle.
    using (var sr = new StreamReader(filename))
    {
        while (!sr.EndOfStream)
        {
            var line = sr.ReadLine();
            if (!String.IsNullOrEmpty(line))
            {
                lines.Add(line);
            }
        }
    }

    foreach (var word in StartEndFeature)
    {
        for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
        {
            var line = lines[CurrentLineIdx];
            var list = RegularTool.GetMultiValueBetweenString(line, word.StartWith, word.EndWith);
            foreach (var item in list)
            {
                // Record the line index like the other text-file extractors do;
                // Loc was previously left at its default value here.
                CandidateWord.Add(new LocAndValue<string>() { Loc = CurrentLineIdx, Value = item });
            }
        }
    }
}
/// <summary>
/// Extracts candidate text lying between a start keyword and an end keyword.
/// For every key and every paragraph the words are walked in order: once a start word
/// is seen, the following words are concatenated until an end word is reached, at which
/// point the accumulated text is added to <c>CandidateWord</c>.
/// </summary>
/// <param name="keys">Start/end keyword definitions, each with optional dependency-relation filters.</param>
/// <param name="dplist">Paragraphs, each given as a list of parsed words.</param>
public void StartWithKey(List<DPKeyWord> keys, List<List<struWordDP>> dplist)
{
    foreach (var key in keys)
    {
        foreach (var paragragh in dplist)
        {
            var collecting = false;
            var collected = String.Empty;

            foreach (var word in paragragh)
            {
                // A delimiter met while collecting discards the partial text.
                var isDelimiter = word.cont == "。" || word.cont == ":" || word.cont == "," || word.cont == ";";
                if (isDelimiter && collecting)
                {
                    collected = String.Empty;
                    collecting = false;
                    continue;
                }

                if (collecting)
                {
                    // Words carrying this relation are skipped entirely, including
                    // the start/end checks below.
                    if (word.relate == LTPTrainingDP.右附加关系)
                    {
                        continue;
                    }
                    collected += word.cont;
                }

                // The start test runs after the append above, so the start word itself is
                // never part of the collected text. An empty relation filter accepts any relation.
                if (key.StartWord.Contains(word.cont)
                    && (key.StartDPValue.Length == 0 || key.StartDPValue.Contains(word.relate)))
                {
                    collecting = true;
                }

                // The end test runs after the append, so the end word is included in the
                // emitted text whenever collection was already active.
                if (key.EndWord.Contains(word.cont)
                    && (key.EndDPValue.Length == 0 || key.EndDPValue.Contains(word.relate)))
                {
                    if (collecting)
                    {
                        CandidateWord.Add(new LocAndValue<string>() { Value = collected });
                    }
                    collecting = false;
                    collected = String.Empty;
                }
            }
        }
    }
}
// Candidate words -- region: plain text
/// <summary>
/// Extracts candidate words from a plain text file. When the file exists,
/// <c>CandidateWord</c> is cleared and every text extraction pass whose feature list is
/// non-empty is run; when it does not exist, nothing is changed.
/// </summary>
/// <param name="filename">Path of the text file to process.</param>
public void ExtractFromTextFile(string filename)
{
    if (File.Exists(filename))
    {
        CandidateWord.Clear();

        if (LeadingColonKeyWordList.Length > 0)
        {
            ExtractTextByColonKeyWord(filename);
        }

        if (StartEndFeature.Length > 0)
        {
            ExtractByStartEndStringFeature(filename);
        }

        if (LeadingColonKeyWordListInChineseBrackets.Length > 0)
        {
            ExtractTextByInChineseBracketsColonKeyWord(filename);
        }
    }
}
/// <summary>
/// Search driver: applies an extraction function to the content of every non-table node
/// and records each returned value in <c>CandidateWord</c> together with the node's
/// <c>PositionId</c>.
/// </summary>
/// <param name="root">HTML root whose children are the paragraphs to search.</param>
/// <param name="ExtractMethod">Extraction function: takes a node's content and returns the candidate strings found in it.</param>
void SearchNormalContent(MyRootHtmlNode root, Func<String, List<String>> ExtractMethod)
{
    foreach (var paragraph in root.Children)
    {
        // Taken from each paragraph's content: the content already includes embedded
        // lists, so those are not processed again here.
        foreach (var node in paragraph.Children)
        {
            // Only nodes with TableId == -1 (non-table content) are searched.
            if (node.TableId != -1)
            {
                continue;
            }

            var found = ExtractMethod(node.Content);
            foreach (var value in found)
            {
                CandidateWord.Add(new LocAndValue<String>() { Loc = node.PositionId, Value = value });
            }
        }
    }
}
/// <summary>
/// Scans a text file line by line; for each line, the first keyword of
/// <c>LeadingColonKeyWordList</c> for which <c>Utility.GetStringAfter</c> yields a
/// non-empty string contributes that string to <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Space characters are removed from each line before matching. Empty lines are dropped
/// before indexing, so <c>Loc</c> is the index among the non-empty lines rather than the
/// physical line number in the file.
/// </remarks>
public void ExtractTextByColonKeyWord(string filename)
{
    var lines = new List<String>();

    // "using" disposes the reader even when reading throws; the previous manual
    // Close() call was skipped on exceptions and leaked the file handle.
    using (var sr = new StreamReader(filename))
    {
        while (!sr.EndOfStream)
        {
            var line = sr.ReadLine();
            if (!String.IsNullOrEmpty(line))
            {
                lines.Add(line);
            }
        }
    }

    for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
    {
        var line = lines[CurrentLineIdx].Replace(" ", "");
        foreach (var word in LeadingColonKeyWordList)
        {
            // Look the value up once; a null or empty result means this keyword
            // produced nothing for the line, so try the next keyword.
            var result = Utility.GetStringAfter(line, word);
            if (string.IsNullOrEmpty(result))
            {
                continue;
            }

            CandidateWord.Add(new LocAndValue<string>() { Loc = CurrentLineIdx, Value = result });
            break; // only the first matching keyword is used per line
        }
    }
}
// Assuming N as word length. Output can contain up to (26(N+1) - N + 25N + N + (N-1)) = 52N + 25 words.
/// <summary>
/// Builds the set of candidates reachable from <paramref name="word"/> by a single edit
/// (insertion, replacement, deletion or transposition of adjacent characters) applied at
/// any position from <c>word.StartIndex</c> onwards.
/// </summary>
/// <param name="word">Source candidate; its <c>Word</c> text is edited starting at its <c>StartIndex</c>.</param>
/// <returns>The set of single-edit candidates.</returns>
private HashSet<CandidateWord> GetCorrectionsWithEditDistanceOne(CandidateWord word)
{
    var corrections = new HashSet<CandidateWord>();
    string text = word.Word;
    int length = text.Length;

    for (int pos = word.StartIndex; pos <= length; pos++)
    {
        // Insertions: allowed at every position, including one past the last character.
        foreach (char letter in Constants.EnglishAlphabet)
        {
            corrections.Add(new CandidateWord(text.Insert(pos, letter.ToString()), pos + 1));
        }

        // The remaining edits need an existing character at pos.
        if (pos == length)
        {
            continue;
        }

        // Replacements: every alphabet letter except the one already at pos.
        char[] replaced = text.ToCharArray();
        char current = text[pos];
        foreach (char letter in Constants.EnglishAlphabet)
        {
            if (letter == current)
            {
                continue;
            }
            replaced[pos] = letter;
            corrections.Add(new CandidateWord(new string(replaced), pos + 1));
        }

        // Deletion of the character at pos.
        corrections.Add(new CandidateWord(text.Remove(pos, 1), pos));

        // Transposition of the characters at pos and pos + 1; the resulting candidate
        // is created with start index 0.
        if (pos < length - 1)
        {
            char[] swapped = text.ToCharArray();
            char held = swapped[pos];
            swapped[pos] = swapped[pos + 1];
            swapped[pos + 1] = held;
            corrections.Add(new CandidateWord(new string(swapped), 0));
        }
    }

    return corrections;
}