/// <summary>
/// Scans a text file line by line and records, for every keyword in
/// <c>LeadingColonKeyWordListInChineseBrackets</c>, each value returned by
/// <c>GetValueInChineseBracketsLeadingKeyWord</c> as a candidate in <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Empty lines are dropped before indexing, so the <c>Loc</c> stored on each candidate is the
/// index within the list of non-empty lines, not the physical line number in the file.
/// </remarks>
public void ExtractTextByInChineseBracketsColonKeyWord(string filename)
{
    var lines = new List<String>();

    // "using" guarantees the file handle is released even if reading throws.
    using (var reader = new StreamReader(filename))
    {
        string text;
        while ((text = reader.ReadLine()) != null)
        {
            if (!String.IsNullOrEmpty(text))
            {
                lines.Add(text);
            }
        }
    }

    for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
    {
        var line = lines[CurrentLineIdx];
        foreach (var word in LeadingColonKeyWordListInChineseBrackets)
        {
            var result = GetValueInChineseBracketsLeadingKeyWord(line, word);
            foreach (var item in result)
            {
                CandidateWord.Add(new LocAndValue<string>() { Loc = CurrentLineIdx, Value = item });
            }
        }
    }
}
/// <summary>
/// Scans a text file and, for every entry in <c>StartEndFeature</c>, records each value that
/// <c>RegularTool.GetMultiValueBetweenString</c> finds between the entry's <c>StartWith</c> and
/// <c>EndWith</c> strings as a candidate in <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Empty lines are skipped. Candidates are added feature by feature (outer loop), then line by
/// line (inner loop).
/// </remarks>
void ExtractByStartEndStringFeature(string filename)
{
    var lines = new List<String>();

    // "using" guarantees the file handle is released even if reading throws.
    using (var reader = new StreamReader(filename))
    {
        string text;
        while ((text = reader.ReadLine()) != null)
        {
            if (!String.IsNullOrEmpty(text))
            {
                lines.Add(text);
            }
        }
    }

    foreach (var word in StartEndFeature)
    {
        for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
        {
            var line = lines[CurrentLineIdx];
            var list = RegularTool.GetMultiValueBetweenString(line, word.StartWith, word.EndWith);
            foreach (var item in list)
            {
                // NOTE(review): Loc is not populated here, so it keeps its default value;
                // confirm whether the line index should be recorded as in the sibling extractors.
                CandidateWord.Add(new LocAndValue<string>() { Value = item });
            }
        }
    }
}
/// <summary>
/// Extracts candidates delimited by start and end keywords.
/// </summary>
/// <remarks>
/// For every key and every paragraph, the words are walked in order. Once a start word is
/// matched, the text of the following words is accumulated; when an end word is matched while
/// accumulating, the accumulated text is added to <c>CandidateWord</c>. Hitting one of the
/// punctuation tokens while accumulating discards the text gathered so far.
/// </remarks>
/// <param name="keys">Keyword rules supplying StartWord/EndWord and StartDPValue/EndDPValue.</param>
/// <param name="dplist">Paragraphs, each a list of words exposing <c>cont</c> and <c>relate</c>.</param>
public void StartWithKey(List<DPKeyWord> keys, List<List<struWordDP>> dplist)
{
    foreach (var key in keys)
    {
        foreach (var paragragh in dplist)
        {
            var collecting = false;
            var buffer = String.Empty;

            foreach (var word in paragragh)
            {
                var isBreakToken = word.cont == "。" || word.cont == ":" || word.cont == "," || word.cont == ";";

                // A punctuation token aborts an extraction that is in progress.
                if (isBreakToken && collecting)
                {
                    buffer = String.Empty;
                    collecting = false;
                    continue;
                }

                if (collecting)
                {
                    // Words carrying this relation are skipped entirely, including the
                    // start/end checks below.
                    if (word.relate == LTPTrainingDP.右附加关系)
                    {
                        continue;
                    }
                    buffer += word.cont;
                }

                // An empty DP-value list means "accept any relation".
                if (key.StartWord.Contains(word.cont)
                    && (key.StartDPValue.Length == 0 || key.StartDPValue.Contains(word.relate)))
                {
                    collecting = true;
                }

                // The end check runs after the start check, so the same word can both open
                // and close an extraction.
                if (key.EndWord.Contains(word.cont)
                    && (key.EndDPValue.Length == 0 || key.EndDPValue.Contains(word.relate)))
                {
                    if (collecting)
                    {
                        CandidateWord.Add(new LocAndValue<string>() { Value = buffer });
                    }
                    collecting = false;
                    buffer = String.Empty;
                }
            }
        }
    }
}
/// <summary>
/// Search workflow: runs an extraction method over the non-table content of an HTML tree.
/// </summary>
/// <param name="root">HTML root node.</param>
/// <param name="ExtractMethod">Extraction method to apply (takes HTML content, returns the list of candidate words).</param>
void SearchNormalContent(MyRootHtmlNode root, Func<String, List<String>> ExtractMethod)
{
    foreach (var paragraph in root.Children)
    {
        // Take candidates from each paragraph's content nodes; the content already includes
        // embedded lists, so they are not processed again here.
        foreach (var node in paragraph.Children)
        {
            // A TableId other than -1 marks table content, which is skipped.
            if (node.TableId != -1)
            {
                continue;
            }

            var found = ExtractMethod(node.Content);
            foreach (var value in found)
            {
                CandidateWord.Add(new LocAndValue<String>() { Loc = node.PositionId, Value = value });
            }
        }
    }
}
/// <summary>
/// Scans a text file and, for each non-empty line, records the text that
/// <c>Utility.GetStringAfter</c> returns for the first keyword in <c>LeadingColonKeyWordList</c>
/// that yields a non-empty result, as a candidate in <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <remarks>
/// Spaces are stripped from each line before matching. Empty lines are dropped before indexing,
/// so the stored <c>Loc</c> is the index within the list of non-empty lines. At most one
/// candidate is added per line.
/// </remarks>
public void ExtractTextByColonKeyWord(string filename)
{
    var lines = new List<String>();

    // "using" guarantees the file handle is released even if reading throws.
    using (var reader = new StreamReader(filename))
    {
        string text;
        while ((text = reader.ReadLine()) != null)
        {
            if (!String.IsNullOrEmpty(text))
            {
                lines.Add(text);
            }
        }
    }

    for (int CurrentLineIdx = 0; CurrentLineIdx < lines.Count; CurrentLineIdx++)
    {
        var line = lines[CurrentLineIdx].Replace(" ", "");
        foreach (var word in LeadingColonKeyWordList)
        {
            // Evaluate the helper once per keyword; null and empty both mean "no match".
            var result = Utility.GetStringAfter(line, word);
            if (string.IsNullOrEmpty(result))
            {
                continue;
            }

            CandidateWord.Add(new LocAndValue<string>() { Loc = CurrentLineIdx, Value = result });
            break; // first matching keyword wins for this line
        }
    }
}