Esempio n. 1
0
        /// <summary>
        /// Lemmatizes <paramref name="value"/>, touching only its last word when it has several.
        /// </summary>
        /// <param name="value">Text to normalize.</param>
        /// <returns>
        /// When <paramref name="value"/> contains no space, or ends with " s" or "'s": the lemma of
        /// the trimmed value. Otherwise: everything before the last space, a single space, and the
        /// lemma of the trimmed text after the last space.
        /// </returns>
        public string Normalize(string value)
        {
            bool lemmatizeWhole = value.IndexOf(' ') == -1
                                  || value.EndsWith(" s")
                                  || value.EndsWith("'s");

            if (!lemmatizeWhole)
            {
                // Split at the last space: the head is kept verbatim, only the tail is lemmatized.
                int    splitAt = value.LastIndexOf(' ');
                string head    = value.Substring(0, splitAt);
                string tail    = value.Substring(splitAt + 1).Trim();

                return(head + " " + lemmatizer.Lemmatize(tail));
            }

            return(lemmatizer.Lemmatize(value.Trim()));
        }
Esempio n. 2
0
        /// <summary>
        /// Splits <paramref name="htmlResult"/> with <c>_splitRegex</c>, drops blank tokens, stop
        /// words and one-character tokens, lemmatizes what remains and wraps the resulting term
        /// sequence in a <see cref="DocumentVector"/>.
        /// </summary>
        /// <param name="htmlResult">Text to turn into a term vector.</param>
        /// <returns>A <see cref="DocumentVector"/> built from the lemmatized terms.</returns>
        private DocumentVector vectorizeDocument(String htmlResult)
        {
            var lmtz = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

            // The lower-cased form is used for BOTH the stop-word test and the lemmatizer input.
            // Lemmatizing the raw token instead would let "Dogs" and "dogs" end up as different
            // terms even though the stop-word filter already treats them as the same word.
            var terms = _splitRegex.Split(htmlResult)
                                   .Where(token => !String.IsNullOrWhiteSpace(token))
                                   .Select(token => token.ToLower())
                                   .Where(canonical => !_stopWords.Contains(canonical) && canonical.Length > 1)
                                   .Select(canonical => lmtz.Lemmatize(canonical));

            // NOTE(review): the query is passed on unevaluated (deferred); presumably
            // DocumentVector enumerates it in its constructor - confirm.
            return(new DocumentVector(terms));
        }
Esempio n. 3
0
        /// <summary>
        /// Reduces <paramref name="term"/> to its letters, lower-cases the result and lemmatizes it.
        /// </summary>
        /// <param name="term">
        /// Raw term. Every character for which <c>char.IsLetter</c> is false (digits, whitespace,
        /// punctuation) is discarded.
        /// </param>
        /// <param name="lemmatizer">Lemmatizer applied to the cleaned, lower-cased term.</param>
        /// <returns>Whatever <c>lemmatizer.Lemmatize</c> returns for the cleaned term.</returns>
        private string Preprocess(string term, LemmatizerPrebuiltCompact lemmatizer)
        {
            // A letters-only filter already excludes digits, whitespace and newlines, so no
            // separate alphanumeric pass or newline trim is needed.
            char[] letters = term.ToCharArray().Where(char.IsLetter).ToArray();
            string lowered = new string(letters).ToLower();

            return(lemmatizer.Lemmatize(lowered));
        }
    /// <summary>
    /// Loads the page at <paramref name="url"/>, collects the text nodes that are direct children
    /// of its &lt;p&gt; elements, strips punctuation, passes the text through
    /// <c>Services.StopwordTool.RemoveStopwords</c> and returns the lemmatized words.
    /// </summary>
    /// <param name="url">Address of the page to load; it is also echoed to the console.</param>
    /// <returns>
    /// The lemmas, each followed by a single space. The literal "ERROR" is returned when the page
    /// has no matching paragraph text or when loading/processing throws.
    /// </returns>
    static string getArticleBody(string url)
    {
        try
        {
            HtmlAgilityPack.HtmlWeb      web = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument doc = web.Load(url);
            Console.WriteLine(url);

            // SelectNodes yields null (not an empty collection) when nothing matches, so check
            // explicitly instead of relying on a NullReferenceException reaching the catch below.
            var paragraphTexts = doc.DocumentNode.SelectNodes("//p/text()");
            if (paragraphTexts == null)
            {
                return("ERROR");
            }

            var text = new System.Text.StringBuilder();
            foreach (HtmlNode node in paragraphTexts)
            {
                text.Append(Regex.Replace(node.InnerText, @"[^\w\s]", " ").ToLower());
                // Keep the last word of one text node from fusing with the first word of the next.
                text.Append(' ');
            }

            string withoutStopwords = Services.StopwordTool.RemoveStopwords(text.ToString());

            //Get just the list of words. The regex above keeps every whitespace character, so
            //line breaks and tabs must separate words as well as plain spaces.
            string[] toStemSplit = withoutStopwords.Split(
                new char[] { ' ', ',', '.', ')', '(', '\r', '\n', '\t' }, StringSplitOptions.RemoveEmptyEntries);

            //Load the Lemmatizer for English
            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

            var stemmed = new System.Text.StringBuilder();
            foreach (string word in toStemSplit)
            {
                //Lemmatize the lower-cased word and add it to the output
                stemmed.Append(lmtz.Lemmatize(word.ToLower())).Append(' ');
            }

            return(stemmed.ToString());
        }
        catch (Exception ex)
        {
            // Best effort: callers receive the "ERROR" sentinel, but the cause is no longer lost.
            Console.Error.WriteLine("getArticleBody failed for {0}: {1}", url, ex.Message);
            return("ERROR");
        }
    }
Esempio n. 5
0
        /// <summary>
        /// Groups the entries of <paramref name="str"/> that are longer than two characters by the
        /// Russian lemma of their lower-cased form and reports every lemma group that has more
        /// than one member.
        /// </summary>
        /// <param name="str">Words to analyse.</param>
        /// <returns>
        /// One <see cref="Word"/> per distinct lower-cased spelling inside each reported group,
        /// constructed from (spelling, number of words in the group, number of distinct spellings
        /// in the group). Groups are visited in descending order of their size.
        /// </returns>
        static public List <Word> EqualWords(string[] str)
        {
            ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);

            var repeatedLemmas = str.AsParallel()
                                 .Where(token => token.Length > 2)
                                 .GroupBy(token => lemmatizer.Lemmatize(token.ToLower()))
                                 .Select(lemmaGroup => new
            {
                Total     = lemmaGroup.Count(),
                Spellings = lemmaGroup.GroupBy(token => token.ToLower()).ToList()
            })
                                 // Groups with a single member contribute nothing to the result.
                                 .Where(entry => entry.Total > 1)
                                 .OrderByDescending(entry => entry.Total);

            List <Word> result = new List <Word>();

            foreach (var entry in repeatedLemmas)
            {
                foreach (var spelling in entry.Spellings)
                {
                    result.Add(new Word(spelling.Key, entry.Total, entry.Spellings.Count));
                }
            }
            return(result);
        }
Esempio n. 6
0
        /// <summary>
        /// Console entry point. Reads the index through <c>IndexBuilder.ReadData</c>, derives an
        /// IDF value and per-posting weights for every index entry, then answers vector-search
        /// queries typed on standard input, printing up to five best-scoring document ids per
        /// query. The loop ends when standard input is exhausted.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            //Console.WriteLine("Input link: ");
            //String s = Console.ReadLine();
            //Console.WriteLine("Input Directory to save: ");
            //String d = Console.ReadLine();

            String startupPath = System.IO.Directory.GetCurrentDirectory();

            //startupPath = Path.Combine(startupPath, d);


            dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
            //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas

            /* STEP 1. PARSE HTML
             * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
             *
             * parser.SetSettings(new NekdoSettings(1,100));
             * parser.OnNewData += NewData;
             * parser.OnComplete += Complete;
             * dir = new DirectoryInfo(startupPath);
             * try{
             *  dir.Create();
             * }
             * catch(IOException){
             *  Console.WriteLine("This directory has already exist. Continue work with this directory");
             * }
             * parser.Start();
             * while(parser.IsActive()){//awaiting parser...
             *
             * }
             *
             *
             *
             * CreateIndexF(parser.GetUrls());
             */


            //STEP 2 STEMMING

            /*
             * TrainDataParser TDP = new TrainDataParser();
             *
             *
             * Lemmatization(TDP);
             *
             * Console.WriteLine("");
             */
            //STEP 3 CREATING INDEX.
            String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");

            Console.WriteLine("===STEP 3 ===");

            IndexBuilder builder = new IndexBuilder();

            Console.WriteLine("Source: {0} ", builder.Source);
            Console.WriteLine("Dest: {0}", indexFileP);


            LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX


            // UNCOMMENT FOR VECTOR RETRIEVAL (STEP 5)

            // For every term: store 100 / (number of ids) rounded to 5 places as its IDF, and
            // append prob * IDF to Weights for each entry of Probs.
            // NOTE(review): 100.0 looks like the corpus size (score[] below has 101 slots) - confirm.
            foreach (KeyValuePair <String, IndexEntry> p in indexer)
            {
                Double I = Math.Round(100.0 / p.Value.Ids.Count, 5);
                p.Value.IDF = I;//Math.Log(100.0/p.Value.Ids.Count, 10.0);

                foreach (Double prob in p.Value.Probs)
                {
                    p.Value.Weights.Add(prob * I); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
                }
                //String data = p.Key +" : "+ p.Value;
                //__CreateIFile(indexFileP, data);//read Data from indexer to file.
            }

            Console.WriteLine("Done.");



            // stem and bp are only referenced by the commented-out boolean search (STEP 4) below.
            IStemmer         stem   = new RussianStemmer();                                    //STEMMER
            BoolSyntaxParser bp     = new BoolSyntaxParser();                                  //PARSER OF BOOLEAN EXPRESSIONS
            ILemmatizer      lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.


            //STEP 4. BOOLEAN SEARCH BY(indexer)

            /*
             * while(true){
             *  Console.WriteLine("Input search str...");
             *  String ui = Console.ReadLine();
             *
             *  String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
             *  LinkedStack<String> ui_w =  bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
             *
             *  String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
             *  foreach(String it2 in ui_wa){
             *      Console.WriteLine(it2);
             *  }
             *  SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
             *  __FindLinks(idsOf);
             *
             * }*/


            //STEP 5 Vector SEARCH BY(indexer).

            ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.

            Console.WriteLine("VECTOR SEARCH...\n");
            while (true)
            {
                PQ.Clear();
                Console.WriteLine("Input search str...");
                String   ui    = Console.ReadLine();
                if (ui == null)
                {
                    // ReadLine returns null once standard input is exhausted (EOF / redirected
                    // input); stop instead of throwing on ui.ToLower() below.
                    break;
                }
                Double[] score = new Double[101];
                //Double[] lengths = new Double[101];//ST_C
                Double[] lengths = builder.GetLens();//ST_UC
                Double   q_w     = 0.0;
                String[] u       = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (String t in u)
                {
                    IndexEntry te;
                    if (indexer.TryGetValue(lemmer.Lemmatize(t), out te))
                    {
                        q_w += te.IDF * te.IDF;
                        // NOTE(review): Weights is read starting at index 1 - presumably the
                        // project's list type is 1-based; confirm against its implementation.
                        Int32 i = 1;
                        foreach (Int32 id in te.Ids)
                        {
                            score[id] += te.Weights[i];
                            //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                            i++;
                        }
                    }
                }
                // q_w stays 0 when no query term was found in the index.
                q_w = Math.Sqrt(q_w);
                if (q_w == 0.0)
                {
                    Console.WriteLine("NOT FOUND");
                }
                else
                {
                    // score[] is indexed by document id 1..100 while lengths[] is read at id - 1.
                    for (Int32 k = 1; k < 101; k++)
                    {
                        if (lengths[k - 1] == 0) //ST_C
                        {
                            continue;            //ST_C
                        }
                        //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                        //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                        score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                        if (score[k] == 0.0)
                        {
                            continue;
                        }
                        // The heap key is 1 / score, so DeleteMin below hands back the highest
                        // score first.
                        PQ.Add(new HeapEntry()
                        {
                            Relevance = 1d / score[k], Id = k
                        });                                                      //ASC ORDER
                    }
                    SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
                    // Report at most KM results per query.
                    Int32 KM = 5;
                    while (!PQ.IsEmpty() && KM > 0)
                    {
                        HeapEntry et = PQ.DeleteMin();
                        Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                        docIds.Add(et.Id);
                        KM--;
                    }
                    Console.WriteLine("");
                    __FindLinksV(docIds);
                }
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Extracts keywords from <paramref name="data"/>: lower-cases and tokenizes the text,
        /// drops tokens rejected by <c>stopWordTest</c>, lemmatizes tokens longer than three
        /// characters, drops tokens rejected by <c>commonWordTest</c> and every occurrence of the
        /// lower-cased text of <c>textBox1</c>, then keeps the words that occur more than
        /// <paramref name="count"/> times.
        /// </summary>
        /// <param name="data">Text to analyse.</param>
        /// <param name="count">A word is kept only when it occurs more than this many times.</param>
        /// <returns>
        /// A <see cref="Keywords"/> that received one <c>addOcc(word, position)</c> call per
        /// occurrence of each kept word (position = index in the filtered word list), with its
        /// <c>words</c> sorted by <c>sortWordsCount</c>. Each kept word and its count is also
        /// appended to <c>textBox2</c> and written to the console.
        /// </returns>
        private Keywords getKeywords(string data, int count)
        {
            string paragraph = data.ToLower();
            string[] words = paragraph.Split(new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' }, StringSplitOptions.RemoveEmptyEntries);

            string[]      swords = words.Where(x => !stopWordTest(x)).ToArray();
            List <string> lwords = new List <string>();
            ILemmatizer   lemm   = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

            foreach (string word in swords)
            {
                if (word.Length == 1)
                {
                    continue;
                }

                // Words of two or three characters are kept as they are (already lower-case);
                // only longer words go through the lemmatizer.
                lwords.Add(word.Length <= 3 ? word : lemm.Lemmatize(word));
            }

            List <string> fwords = lwords.Where(x => !commonWordTest(x)).ToList();

            // Remove the search keyword itself. RemoveAll drops every occurrence; an index loop
            // that removes while advancing skips the element following each removal and leaves
            // consecutive occurrences behind.
            string sptr = textBox1.Text.ToLower();
            fwords.RemoveAll(w => w.Equals(sptr));

            Dictionary <string, int> finallist = new Dictionary <string, int>();

            foreach (var w in fwords.GroupBy(i => i))
            {
                int occurrences = w.Count();
                if (occurrences > count)
                {
                    finallist.Add(w.Key, occurrences);
                    textBox2.AppendText(w.Key + ":  " + occurrences + "\n");
                    Console.WriteLine("{0} {1}", w.Key, occurrences);
                }
            }

            Keywords keys = new Keywords();

            // Record the position of every occurrence of each kept word.
            for (int i = 0; i < fwords.Count; i++)
            {
                if (finallist.ContainsKey(fwords[i]))
                {
                    keys.addOcc(fwords[i], i);
                }
            }
            keys.words.Sort(sortWordsCount);
            return(keys);
        }