/// <summary>
/// Builds a <see cref="DocumentVector"/> from the lemmatized terms of the given text.
/// </summary>
/// <param name="htmlResult">Text to tokenize with <c>_splitRegex</c>.</param>
/// <returns>A <see cref="DocumentVector"/> constructed from the filtered, lemmatized tokens.</returns>
private DocumentVector vectorizeDocument(String htmlResult)
{
    var lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

    // The stop-word and length checks run on the lower-cased token, but the
    // lemmatizer is handed the token with its original casing.
    // The query is not materialized here; it is passed to DocumentVector as-is.
    var terms = _splitRegex.Split(htmlResult)
        .Where(token => !String.IsNullOrWhiteSpace(token))
        .Where(token =>
        {
            var lowered = token.ToLower();
            return !_stopWords.Contains(lowered) && lowered.Length > 1;
        })
        .Select(token => lemmatizer.Lemmatize(token));

    return new DocumentVector(terms);
}
Exemplo n.º 2
0
        /// <summary>
        /// Finds the words in <paramref name="data"/> that occur more than
        /// <paramref name="count"/> times and records the positions of their occurrences.
        /// </summary>
        /// <remarks>
        /// The text is lower-cased, split on a fixed set of punctuation and whitespace
        /// characters, filtered through <c>stopWordTest</c>, lemmatized (words longer than
        /// three characters only), and filtered through <c>commonWordTest</c>. Every
        /// occurrence of the lower-cased text of <c>textBox1</c> is then removed.
        /// Each qualifying word and its frequency is appended to <c>textBox2</c> and
        /// written to the console.
        /// </remarks>
        /// <param name="data">Text to analyse.</param>
        /// <param name="count">Frequency threshold; a word qualifies only if it occurs strictly more often than this.</param>
        /// <returns>
        /// A <see cref="Keywords"/> instance populated through <c>addOcc</c> with each qualifying
        /// word and its index in the filtered word list, with <c>words</c> sorted using
        /// <c>sortWordsCount</c>.
        /// </returns>
        private Keywords getKeywords(string data, int count)
        {
            // Lower-case once up front so all later filters and comparisons see the same casing.
            string paragraph = data.ToLower();
            string[] words = paragraph.Split(new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' }, StringSplitOptions.RemoveEmptyEntries);

            string[] swords = words.Where(x => !stopWordTest(x)).ToArray();

            ILemmatizer lemm = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
            List<string> lwords = new List<string>();
            foreach (string word in swords)
            {
                if (word.Length == 1)
                {
                    continue;
                }

                // Words of two or three characters are kept verbatim (already lower-case);
                // only longer words are passed through the lemmatizer.
                lwords.Add(word.Length <= 3 ? word : lemm.Lemmatize(word));
            }

            List<string> fwords = lwords.Where(x => !commonWordTest(x)).ToList();

            // Remove every occurrence of the keyword taken from textBox1.
            // RemoveAll is used instead of an index loop with Remove: removing while
            // advancing the index skips the element that shifts into the freed slot,
            // so adjacent occurrences of the keyword were left behind.
            string sptr = textBox1.Text.ToLower();
            fwords.RemoveAll(w => string.Equals(w, sptr, StringComparison.Ordinal));

            Dictionary<string, int> finallist = new Dictionary<string, int>();
            foreach (var w in fwords.GroupBy(i => i))
            {
                int occurrences = w.Count();
                if (occurrences > count)
                {
                    finallist.Add(w.Key, occurrences);
                    textBox2.AppendText(w.Key + ":  " + occurrences + "\n");
                    Console.WriteLine("{0} {1}", w.Key, occurrences);
                }
            }

            // Record each occurrence of a qualifying word together with its index in fwords.
            Keywords keys = new Keywords();
            for (int i = 0; i < fwords.Count; i++)
            {
                if (finallist.ContainsKey(fwords[i]))
                {
                    keys.addOcc(fwords[i], i);
                }
            }

            keys.words.Sort(sortWordsCount);
            return keys;
        }