Пример #1
0
        /// <summary>
        /// Collects the distinct keywords found on every line of <c>_lines</c> whose
        /// first eight characters equal <paramref name="taisho"/>.
        /// </summary>
        /// <param name="taisho">
        /// Value compared against the first eight characters of each line.
        /// </param>
        /// <returns>
        /// A set holding, for every space-separated token in the second
        /// '║'-delimited field of a matching line, either the cleaned token itself
        /// (when <c>_dbTerms</c> contains it) or each of its code points that is
        /// not listed in <c>_nonCjk</c>.
        /// </returns>
        /// <remarks>
        /// A token ending in '-' is treated as continuing on the following matching
        /// line: the hyphen is dropped, the first word of that line is appended, and
        /// that word is removed from the local copy of the line so it is not
        /// processed a second time. This mirrors the straddle handling in
        /// <c>BuildConcordanceOfHeadwords(HashSet&lt;string&gt;)</c>.
        /// </remarks>
        private HashSet <string> GetKeywordsFor(string taisho)
        {
            // Lines shorter than eight characters cannot match and would make
            // Substring throw, so they are filtered out first.
            var taishoLines = _lines.FindAll(l => l.Length >= 8 && l.Substring(0, 8).Equals(taisho));
            var keywords    = new HashSet <string>();

            for (int i = 0; i < taishoLines.Count; i++)
            {
                string line        = taishoLines[i];
                bool   hasNextLine = i + 1 < taishoLines.Count;
                string nextLine    = hasNextLine ? taishoLines[i + 1] : "";

                string[] lineSplit = line.Split('║');
                if (lineSplit.Length < 2)
                {
                    continue;
                }

                string[] content = lineSplit[1].Split(' ');

                foreach (string cjk in content)
                {
                    if (cjk.Length == 0)
                    {
                        continue;
                    }

                    string cleanCjk = cjk;

                    // Compare against the char literal: char.Equals(string) is always false.
                    if (cjk[cjk.Length - 1] == '-')
                    {
                        cleanCjk = cjk.Substring(0, cjk.Length - 1);

                        // Only join when there is a following line to borrow from.
                        if (hasNextLine)
                        {
                            string firstWord = GetFirstWordOf(nextLine);
                            nextLine = RemoveFirstWordFrom(nextLine, firstWord);

                            // taishoLines is a private copy produced by FindAll, so
                            // writing back does not modify _lines.
                            taishoLines[i + 1] = nextLine;
                            cleanCjk          += firstWord;
                        }
                    }

                    // Clean the (possibly joined) token, not the raw one.
                    cleanCjk = RemoveNonCjkCharsFrom(cleanCjk);
                    if (_dbTerms.Contains(cleanCjk))
                    {
                        keywords.Add(cleanCjk);
                        continue;
                    }

                    // Unknown term: fall back to its individual code points.
                    foreach (string c in Utf8.AsCodePoints(cleanCjk))
                    {
                        if (!_nonCjk.Contains(c))
                        {
                            keywords.Add(c);
                        }
                    }
                }
            }

            return keywords;
        }
Пример #2
0
        /// <summary>
        /// Builds a concordance over every entry in <c>_lines</c>, mapping each key
        /// to the list of lines in which it was found.
        /// </summary>
        /// <returns>
        /// A dictionary keyed by term or single code point. A line is appended once
        /// per occurrence, so it can appear several times under the same key.
        /// </returns>
        /// <remarks>
        /// Only the second '║'-delimited field of a line is examined, split on
        /// spaces. A token contained in <c>_dbTerms</c> is indexed whole unless
        /// <c>_nonCjk</c> contains it; any other token is broken up with
        /// <c>Utf8.AsCodePoints</c> and each piece not in <c>_nonCjk</c> is indexed.
        /// </remarks>
        private Dictionary <string, List <string> > BuildConcordanceOfHeadwords()
        {
            var index = new Dictionary <string, List <string> >();

            foreach (string entry in _lines)
            {
                string[] fields = entry.Split('║');
                if (fields.Length >= 2)
                {
                    foreach (string token in fields[1].Split(' '))
                    {
                        List <string> hits;

                        if (_dbTerms.Contains(token))
                        {
                            // Known term: index the token as a whole.
                            if (_nonCjk.Contains(token))
                            {
                                continue;
                            }

                            if (!index.TryGetValue(token, out hits))
                            {
                                hits         = new List <string>();
                                index[token] = hits;
                            }
                            hits.Add(entry);
                        }
                        else
                        {
                            // Unknown term: index each code point separately.
                            foreach (string codePoint in Utf8.AsCodePoints(token))
                            {
                                if (_nonCjk.Contains(codePoint))
                                {
                                    continue;
                                }

                                if (!index.TryGetValue(codePoint, out hits))
                                {
                                    hits             = new List <string>();
                                    index[codePoint] = hits;
                                }
                                hits.Add(entry);
                            }
                        }
                    }
                }
            }

            return index;
        }
Пример #3
0
        /// <summary>
        /// Builds a concordance restricted to <paramref name="keywords"/>: each
        /// keyword found in <c>_lines</c> maps to the original lines it occurs in.
        /// </summary>
        /// <param name="keywords">
        /// Terms and code points to index; anything not in this set is skipped.
        /// </param>
        /// <returns>
        /// A dictionary keyed by keyword. Values are the unmodified entries of
        /// <c>_lines</c>; a line is appended once per occurrence.
        /// </returns>
        /// <remarks>
        /// Only the second '║'-delimited field of a line is examined, split on
        /// spaces. A token ending in '-' is joined with the first word of the
        /// following line, and that word is removed from a working copy of the line
        /// so it is not processed again. When such a joined token is a known
        /// <c>_dbTerms</c> entry, both the current and the following line are
        /// recorded. A hyphenated token on the last line has no line to join with,
        /// so only its trailing hyphens are stripped.
        /// </remarks>
        private Dictionary <string, List <string> > BuildConcordanceOfHeadwords(HashSet <string> keywords)
        {
            var concordance = new Dictionary <string, List <string> >();

            // Working copy: straddle handling rewrites the following line, while
            // the concordance must keep referring to the untouched _lines entries.
            var lines = new List <string>(_lines);

            for (int i = 0; i < lines.Count; i++)
            {
                string line        = lines[i];
                bool   hasNextLine = i + 1 < lines.Count;
                string nextLine    = hasNextLine ? lines[i + 1] : "";

                string[] lineSplit = line.Split('║');
                if (lineSplit.Length < 2)
                {
                    continue;
                }

                string[] content = lineSplit[1].Split(' ');
                foreach (string cjk in content)
                {
                    if (cjk.Length < 1)
                    {
                        continue;
                    }

                    string cleanCjk = cjk;
                    bool   straddle = false;

                    if (cjk[cjk.Length - 1] == '-')
                    {
                        cleanCjk = cjk.TrimEnd('-');

                        // Join only when a following line exists; indexing i + 1
                        // on the last line would be out of range.
                        if (hasNextLine)
                        {
                            straddle = true;
                            string firstWord = GetFirstWordOf(nextLine);
                            lines[i + 1] = RemoveFirstWordFrom(nextLine, firstWord);
                            cleanCjk    += firstWord;
                        }
                    }

                    cleanCjk = RemoveNonCjkCharsFrom(cleanCjk);

                    List <string> hits;
                    if (!_dbTerms.Contains(cleanCjk))
                    {
                        // Unknown term: index each requested code point separately.
                        foreach (string c in Utf8.AsCodePoints(cleanCjk))
                        {
                            if (!_nonCjk.Contains(c) && keywords.Contains(c))
                            {
                                if (!concordance.TryGetValue(c, out hits))
                                {
                                    hits           = new List <string>();
                                    concordance[c] = hits;
                                }
                                hits.Add(_lines[i]);
                            }
                        }
                    }
                    else if (keywords.Contains(cleanCjk))
                    {
                        if (!concordance.TryGetValue(cleanCjk, out hits))
                        {
                            hits                  = new List <string>();
                            concordance[cleanCjk] = hits;
                        }
                        hits.Add(_lines[i]);

                        // straddle is only set when line i + 1 exists.
                        if (straddle)
                        {
                            hits.Add(_lines[i + 1]);
                        }
                    }
                }
            }

            return concordance;
        }