/// <summary>
/// Collects the keywords that occur in every line of <c>_lines</c> whose first
/// eight characters equal <paramref name="taisho"/>.
/// </summary>
/// <param name="taisho">The 8-character line prefix used to select lines.</param>
/// <returns>
/// A set holding, for each space-separated token in the second '║'-delimited
/// field of a matching line, either the cleaned token itself (when
/// <c>_dbTerms</c> contains it) or each of its code points that is not listed
/// in <c>_nonCjk</c>.
/// </returns>
private HashSet<string> GetKeywordsFor(string taisho)
{
    // Lines shorter than the 8-character prefix cannot match; skip them
    // instead of letting Substring throw.
    var taishoLines = _lines.FindAll(l => l.Length >= 8 && l.Substring(0, 8).Equals(taisho));
    var keywords = new HashSet<string>();

    for (int i = 0; i < taishoLines.Count; i++)
    {
        string line = taishoLines[i];
        bool hasNextLine = i + 1 < taishoLines.Count;
        string nextLine = hasNextLine ? taishoLines[i + 1] : "";

        string[] lineSplit = line.Split('║');
        if (lineSplit.Length < 2)
        {
            continue;
        }

        string[] content = lineSplit[1].Split(' ');
        foreach (string cjk in content)
        {
            if (cjk.Length == 0)
            {
                continue;
            }

            string cleanCjk = cjk;

            // A trailing '-' marks a word that continues on the next line.
            // The comparison must be char-to-char: char.Equals("-") is always
            // false because the argument is a string.
            if (cjk[cjk.Length - 1] == '-')
            {
                cleanCjk = cjk.Substring(0, cjk.Length - 1);
                string firstWord = GetFirstWordOf(nextLine);
                nextLine = RemoveFirstWordFrom(nextLine, firstWord);
                if (hasNextLine)
                {
                    // Persist the removal so the borrowed fragment is not
                    // processed again as a separate word on the next iteration.
                    // taishoLines is a private copy produced by FindAll.
                    taishoLines[i + 1] = nextLine;
                }
                cleanCjk += firstWord;
            }

            // Clean the (possibly joined) word, not the raw token, so the
            // continuation appended above is kept.
            cleanCjk = RemoveNonCjkCharsFrom(cleanCjk);

            if (_dbTerms.Contains(cleanCjk))
            {
                keywords.Add(cleanCjk);
                continue;
            }

            // Not a known term: fall back to its individual code points.
            foreach (string c in Utf8.AsCodePoints(cleanCjk))
            {
                if (!_nonCjk.Contains(c))
                {
                    keywords.Add(c);
                }
            }
        }
    }

    return keywords;
}
/// <summary>
/// Builds a concordance over every line in <c>_lines</c>: each headword maps to
/// the list of lines in which it occurs.
/// </summary>
/// <returns>
/// A dictionary keyed by headword. A space-separated token from the second
/// '║'-delimited field is used as the key when <c>_dbTerms</c> contains it;
/// otherwise each of its code points is used. Entries listed in
/// <c>_nonCjk</c> are skipped. A line is appended once per occurrence.
/// </returns>
private Dictionary<string, List<string>> BuildConcordanceOfHeadwords()
{
    var index = new Dictionary<string, List<string>>();

    foreach (string rawLine in _lines)
    {
        string[] fields = rawLine.Split('║');
        if (fields.Length < 2)
        {
            // No '║' separator on this line, so there is no text field to index.
            continue;
        }

        foreach (string token in fields[1].Split(' '))
        {
            List<string> hits;

            if (_dbTerms.Contains(token))
            {
                // Known term: index the whole token.
                if (_nonCjk.Contains(token))
                {
                    continue;
                }

                if (!index.TryGetValue(token, out hits))
                {
                    hits = new List<string>();
                    index[token] = hits;
                }
                hits.Add(rawLine);
            }
            else
            {
                // Unknown token: index each of its code points separately.
                foreach (string codePoint in Utf8.AsCodePoints(token))
                {
                    if (_nonCjk.Contains(codePoint))
                    {
                        continue;
                    }

                    if (!index.TryGetValue(codePoint, out hits))
                    {
                        hits = new List<string>();
                        index[codePoint] = hits;
                    }
                    hits.Add(rawLine);
                }
            }
        }
    }

    return index;
}
/// <summary>
/// Builds a concordance over every line in <c>_lines</c>, restricted to the
/// given keywords: each keyword maps to the list of lines in which it occurs.
/// </summary>
/// <param name="keywords">Only headwords contained in this set are recorded.</param>
/// <returns>
/// A dictionary keyed by headword. A cleaned token from the second
/// '║'-delimited field is used as the key when <c>_dbTerms</c> contains it;
/// otherwise each of its code points not listed in <c>_nonCjk</c> is used.
/// The stored lines are the unmodified entries of <c>_lines</c>. For a known
/// term that straddles a line break, the following line is recorded as well.
/// </returns>
private Dictionary<string, List<string>> BuildConcordanceOfHeadwords(HashSet<string> keywords)
{
    var concordance = new Dictionary<string, List<string>>();

    // Working copy: the first word of a line is removed once it has been
    // joined to a hyphenated word on the previous line. _lines stays intact.
    var lines = new List<string>(_lines);

    for (int i = 0; i < lines.Count; i++)
    {
        string line = lines[i];
        bool hasNextLine = i + 1 < lines.Count;
        string nextLine = hasNextLine ? lines[i + 1] : "";

        string[] lineSplit = line.Split('║');
        if (lineSplit.Length < 2)
        {
            continue;
        }

        string[] content = lineSplit[1].Split(' ');
        foreach (string cjk in content)
        {
            if (cjk.Length < 1)
            {
                continue;
            }

            string cleanCjk = cjk;
            bool straddle = false;

            // A trailing '-' marks a word that continues on the next line.
            if (cjk[cjk.Length - 1] == '-')
            {
                straddle = true;
                cleanCjk = cjk.TrimEnd('-');
                string firstWord = GetFirstWordOf(nextLine);
                nextLine = RemoveFirstWordFrom(nextLine, firstWord);
                if (hasNextLine)
                {
                    // Only write back when a next line exists; on the last
                    // line the unguarded assignment threw
                    // ArgumentOutOfRangeException.
                    lines[i + 1] = nextLine;
                }
                cleanCjk += firstWord;
            }

            cleanCjk = RemoveNonCjkCharsFrom(cleanCjk);

            List<string> hits;

            if (_dbTerms.Contains(cleanCjk))
            {
                if (!keywords.Contains(cleanCjk))
                {
                    continue;
                }

                if (!concordance.TryGetValue(cleanCjk, out hits))
                {
                    hits = new List<string>();
                    concordance[cleanCjk] = hits;
                }
                hits.Add(_lines[i]);

                // The term spans two lines, so record the continuation line
                // too, provided there is one.
                if (straddle && i + 1 < _lines.Count)
                {
                    hits.Add(_lines[i + 1]);
                }
                continue;
            }

            // Not a known term: fall back to its individual code points.
            foreach (string c in Utf8.AsCodePoints(cleanCjk))
            {
                if (_nonCjk.Contains(c) || !keywords.Contains(c))
                {
                    continue;
                }

                if (!concordance.TryGetValue(c, out hits))
                {
                    hits = new List<string>();
                    concordance[c] = hits;
                }
                hits.Add(_lines[i]);
            }
        }
    }

    return concordance;
}