Beispiel #1
0
        /// <summary>
        /// Collects noun phrases from a tokenized sentence. The phrase chunker
        /// is asked for one chunk tag per token; a phrase starts at a token
        /// tagged "B-NP" and extends over the directly following "I-NP" tokens.
        /// </summary>
        /// <param name="tokens">Tokens of the sentence.</param>
        /// <param name="pos">Part-of-speech tags handed to the chunker together with <paramref name="tokens"/>.</param>
        /// <returns>
        /// The non-empty noun phrases in order of appearance, with their tokens
        /// joined by single spaces.
        /// </returns>
        public string[] ChunkNPs(string[] tokens, string[] pos)
        {
            string[]     tags       = NLPToolsController.GetInstance().GetPhraseChunker().Chunk(tokens, pos);
            List<string> candidates = new List<string>();

            int n = 0;
            while (n < tokens.Length)
            {
                if (!tags[n].Equals("B-NP"))
                {
                    n++;
                    continue;
                }

                // Gather the opening token plus every immediately following "I-NP" token.
                string phrase = tokens[n];
                int    m      = n + 1;
                while (m < tokens.Length && tags[m].Equals("I-NP"))
                {
                    phrase = phrase + " " + tokens[m];
                    m++;
                }

                // Resume at m itself (not m + 1): the token that ended this phrase
                // may be the "B-NP" of the next one and must still be examined.
                n = m;

                // Collapse whitespace runs so each phrase has single-space separators.
                phrase = Regex.Replace(phrase, @"\s+", " ").Trim();
                if (phrase.Length > 0)
                {
                    candidates.Add(phrase);
                }
            }
            return(candidates.ToArray());
        }
Beispiel #2
0
        /// <summary>
        /// Extracts candidate noun terms from a piece of text. The text is
        /// tokenized, POS-tagged and chunked into noun phrases; each phrase is
        /// cleaned with <c>RuntimeProperties.TERM_CLEAN_PATTERN</c>, split, trimmed
        /// of stop words, lower-cased and normalized, then filtered by word
        /// count, digit/letter content and character count.
        /// </summary>
        /// <param name="content">Text to extract from.</param>
        /// <returns>
        /// A map from each normalized candidate to the set of stop-word-trimmed
        /// surface forms that produced it.
        /// </returns>
        public Dictionary <string, ISet <string> > Extract(string content)
        {
            Dictionary <string, ISet <string> > nouns = new Dictionary <string, ISet <string> >();

            string[] tokens     = NLPToolsController.GetInstance().GetTokenizer().Tokenize(content);
            string[] pos        = NLPToolsController.GetInstance().GetPosTagger().Tag(tokens);
            string[] candidates = ChunkNPs(tokens, pos);
            foreach (string candidate in candidates)
            {
                string   cleanedCandidate = WordUtil.ApplyCharacterReplacement(candidate, RuntimeProperties.TERM_CLEAN_PATTERN);
                string[] splitCandidates  = WordUtil.ApplySplitList(cleanedCandidate);
                foreach (string splitCandidate in splitCandidates)
                {
                    string stopRemoved = WordUtil.ApplyTrimStopwords(splitCandidate, stopList, normalizer);
                    if (stopRemoved == null)
                    {
                        continue;
                    }

                    // Lower-case culture-independently so the result does not vary
                    // with the current thread culture.
                    string normalizedCandidate = normalizer.Normalize(stopRemoved.ToLowerInvariant()).Trim();

                    // An empty separator array makes Split break on white space.
                    string[] nElements = normalizedCandidate.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
                    if (nElements.Length < 1 || nElements.Length > RuntimeProperties.TERM_MAX_WORDS)
                    {
                        continue;
                    }
                    // TODO noun phrases starting with numerical digits are completely
                    // discarded but ones that are spelt out are included; this inconsistency
                    // should be fixed. Ideally numbers should be chopped off as stop words.
                    if (RuntimeProperties.TERM_IGNORE_DIGITS && WordUtil.ContainsDigit(normalizedCandidate))
                    {
                        continue;
                    }
                    if (!WordUtil.ContainsLetter(normalizedCandidate))
                    {
                        continue;
                    }
                    if (!WordUtil.HasReasonableNumChars(normalizedCandidate))
                    {
                        continue;
                    }
                    // TODO handle stem-changing irregular plurals correctly - their
                    // variants should be included but aren't

                    // Keep the candidate only if its normalized form literally occurs in
                    // the lower-cased cleaned phrase; an ordinal search avoids
                    // culture-dependent linguistic matching.
                    if (cleanedCandidate.ToLowerInvariant().IndexOf(normalizedCandidate, StringComparison.Ordinal) < 0)
                    {
                        continue;
                    }

                    ISet <string> variants;
                    if (!nouns.TryGetValue(normalizedCandidate, out variants))
                    {
                        variants = new HashSet <string>();
                        nouns[normalizedCandidate] = variants;
                    }
                    variants.Add(stopRemoved);
                }
            }
            return(nouns);
        }
Beispiel #3
0
 /// <summary>
 /// Returns the stored NLPToolsController, creating and storing a new one
 /// when none has been assigned yet.
 /// </summary>
 /// <returns>The value held in <c>instance</c> after this call.</returns>
 public static NLPToolsController GetInstance()
 {
     // Create lazily on first use; no locking is performed here.
     return(instance ?? (instance = new NLPToolsController()));
 }
Beispiel #4
0
        /// <summary>
        /// Runs the string-based extraction on every sentence of a document
        /// and merges the per-sentence results.
        /// </summary>
        /// <param name="document">Document whose content is split into sentences.</param>
        /// <returns>
        /// A map from each key found in any sentence to the union of the
        /// variant sets reported for that key.
        /// </returns>
        public Dictionary <string, ISet <string> > Extract(Document document)
        {
            var merged    = new Dictionary <string, ISet <string> >();
            var sentences = NLPToolsController.GetInstance().GetSentenceDetector().SentenceDetect(document.GetContent());

            foreach (string sentence in sentences)
            {
                foreach (KeyValuePair <string, ISet <string> > entry in Extract(sentence))
                {
                    // Fetch the accumulated set for this key, registering a new one on first sight.
                    ISet <string> bucket;
                    if (!merged.TryGetValue(entry.Key, out bucket))
                    {
                        bucket = new HashSet <string>();
                        merged[entry.Key] = bucket;
                    }
                    bucket.UnionWith(entry.Value);
                }
            }
            return(merged);
        }