/// <summary>
/// Lemmatizes the final word of a multi-word phrase, leaving the leading
/// words untouched.  Single words — and phrases ending in " s" or "'s" —
/// are trimmed and lemmatized as a whole instead.
/// </summary>
/// <param name="value">Word or phrase to normalize.</param>
/// <returns>The phrase with its last word replaced by its lemma.</returns>
public string Normalize(string value)
{
    bool lemmatizeWhole = value.IndexOf(' ') == -1
                          || value.EndsWith(" s")
                          || value.EndsWith("'s");
    if (lemmatizeWhole)
    {
        return lemmatizer.Lemmatize(value.Trim());
    }

    // Split at the last space: everything before it is kept verbatim,
    // only the trailing word goes through the lemmatizer.
    int splitAt = value.LastIndexOf(' ');
    string head = value.Substring(0, splitAt);
    string lastWord = lemmatizer.Lemmatize(value.Substring(splitAt + 1).Trim());
    return head + " " + lastWord;
}
/// <summary>
/// Builds a term vector for the given document text: splits on the shared
/// regex, drops blanks, stop words and single-character tokens, and
/// lemmatizes each surviving term.
/// </summary>
/// <param name="htmlResult">Raw document text to vectorize.</param>
/// <returns>A DocumentVector over the lemmatized terms.</returns>
private DocumentVector vectorizeDocument(String htmlResult)
{
    // Get term vector
    var lmtz = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
    var documentVector =
        from s in _splitRegex.Split(htmlResult)
        where !String.IsNullOrWhiteSpace(s)
        let canonical = s.ToLower()
        where !_stopWords.Contains(canonical) && canonical.Length > 1
        // BUG FIX: lemmatize the lower-cased form that was used for the
        // stop-word test, not the original-cased token; previously a
        // mixed-case word could bypass the (lower-cased) lemma dictionary
        // and come through unlemmatized.
        select lmtz.Lemmatize(canonical);
    return(new DocumentVector(documentVector));
}
/// <summary>
/// Normalizes a raw term before indexing: strips every non-letter
/// character, lower-cases the remainder, and lemmatizes the result.
/// </summary>
/// <param name="term">Raw token text.</param>
/// <param name="lemmatizer">Lemmatizer used to reduce the word to its lemma.</param>
/// <returns>The lemma of the cleaned, lower-cased term.</returns>
private string Preprocess(string term, LemmatizerPrebuiltCompact lemmatizer)
{
    // The original pipeline first kept letters/digits/whitespace and then
    // immediately filtered down to letters only, so the digit/whitespace
    // allowance and the trailing-'\n' trim were dead work; a single
    // letter filter is behaviorally identical.
    string input = new string(term.Where(char.IsLetter).ToArray()).ToLower();
    return lemmatizer.Lemmatize(input);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts the text of all
/// paragraph nodes, strips punctuation, removes stop words, and returns
/// the space-separated English lemmas of the remaining words.
/// Returns "ERROR" on any failure (sentinel preserved for existing callers).
/// </summary>
static string getArticleBody(string url)
{
    try
    {
        HtmlAgilityPack.HtmlWeb web = new HtmlWeb();
        HtmlAgilityPack.HtmlDocument doc = web.Load(url);
        Console.WriteLine(url);

        // SelectNodes returns null when nothing matches; previously this
        // surfaced as a NullReferenceException swallowed by the catch.
        // Keep the same "ERROR" result, but take the explicit path.
        var paragraphs = doc.DocumentNode.SelectNodes("//p/text()");
        if (paragraphs == null)
        {
            return("ERROR");
        }

        // StringBuilder avoids the O(n^2) cost of string += in a loop.
        var textBuilder = new System.Text.StringBuilder();
        foreach (HtmlNode node in paragraphs)
        {
            textBuilder.Append(Regex.Replace(node.InnerText, @"[^\w\s]", " ").ToLower());
        }
        string text = Services.StopwordTool.RemoveStopwords(textBuilder.ToString());

        //Get just the list of words
        string[] toStemSplit = text.Split(
            new char[] { ' ', ',', '.', ')', '(' },
            StringSplitOptions.RemoveEmptyEntries);

        //Load the Lemmatizer for English
        ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

        var stemmed = new System.Text.StringBuilder();
        foreach (string word in toStemSplit)
        {
            // Text is already lower-cased above; the extra ToLower is kept
            // as cheap insurance against RemoveStopwords changing case.
            stemmed.Append(lmtz.Lemmatize(word.ToLower())).Append(' ');
        }
        return(stemmed.ToString());
    }
    catch (Exception ex)
    {
        // Preserve the original "ERROR" sentinel, but record why instead
        // of silently swallowing the failure.
        Console.Error.WriteLine(ex.Message);
        return("ERROR");
    }
}
/// <summary>
/// Groups the given words by their Russian lemma and returns one Word per
/// distinct surface form, tagged with its lemma-group frequency.
/// </summary>
/// <param name="str">Raw word list to analyze.</param>
/// <returns>Word entries, ordered by descending lemma-group frequency.</returns>
static public List <Word> EqualWords(string[] str) {
    ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);
    // Group (in parallel) every word longer than two characters by its
    // lower-cased lemma; num is the group's total occurrence count,
    // word its distinct surface forms, num1 their count.
    // NOTE(review): the Where predicate ignores p -- it keeps every
    // surface form when count > 1 and none otherwise (a group-level
    // filter written as an element filter); confirm this is intended.
    var words = (from s in str.AsParallel()
                 where s.Length > 2
                 group s by lmtz.Lemmatize(s.ToLower()) into d
                 let count = d.Count()
                 select new {
        num = count,
        word = d.GroupBy(p => p.ToLower()).Where(p => count > 1),
        num1 = d.GroupBy(p => p.ToLower()).Where(p => count > 1).Count()
    }).OrderByDescending(p => p.num);
    List <Word> wordsList = new List <Word>();
    // Flatten: one Word per distinct surface form, carrying the lemma
    // group's total count (num) and its distinct-form count (num1).
    foreach (var y in words) {
        foreach (var s in y.word) {
            wordsList.Add(new Word(s.Key, y.num, y.num1));
        }
    }
    return(wordsList);
}
/// <summary>
/// Console entry point for the search-engine pipeline.  Steps 1 (HTML
/// parsing), 2 (stemming) and 4 (boolean search) are kept commented out;
/// the active code runs step 3 (loading the inverted index and attaching
/// tf-idf weights) and step 5 (an interactive vector-space retrieval loop).
/// </summary>
static void Main(string[] args) {
    //Console.WriteLine("Input link: ");
    //String s = Console.ReadLine();
    //Console.WriteLine("Input Directory to save: ");
    //String d = Console.ReadLine();
    String startupPath = System.IO.Directory.GetCurrentDirectory();
    //startupPath = Path.Combine(startupPath, d);
    dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
    //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas
    /* STEP 1. PARSE HTML
     * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
     *
     * parser.SetSettings(new NekdoSettings(1,100));
     * parser.OnNewData += NewData;
     * parser.OnComplete += Complete;
     * dir = new DirectoryInfo(startupPath);
     * try{
     * dir.Create();
     * }
     * catch(IOException){
     * Console.WriteLine("This directory has already exist. Continue work with this directory");
     * }
     * parser.Start();
     * while(parser.IsActive()){//awaiting parser...
     *
     * }
     *
     * CreateIndexF(parser.GetUrls());
     */
    //STEP 2 STEMMING
    /*
     * TrainDataParser TDP = new TrainDataParser();
     *
     * Lemmatization(TDP);
     *
     * Console.WriteLine("");
     */
    //STEP 3 CREATING INDEX.
    String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");
    Console.WriteLine("===STEP 3 ===");
    IndexBuilder builder = new IndexBuilder();
    Console.WriteLine("Source: {0} ", builder.Source);
    Console.WriteLine("Dest: {0}", indexFileP);
    LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX
    // UNCOMMENT FOR VECTOR RETRIEVAL (STEP 5)
    // Attach an IDF to every term and pre-compute a tf-idf weight for each
    // of its postings.
    // NOTE(review): IDF here is 100/df rounded to 5 places, not the usual
    // log(100/df) -- the log variant is kept commented out; confirm which
    // formula is intended.
    foreach (KeyValuePair <String, IndexEntry> p in indexer) {
        Double I = Math.Round(100.0 / p.Value.Ids.Count, 5);
        p.Value.IDF = I;//Math.Log(100.0/p.Value.Ids.Count, 10.0);
        foreach (Double prob in p.Value.Probs) {
            p.Value.Weights.Add(prob * I); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
        }
        //String data = p.Key +" : "+ p.Value;
        //__CreateIFile(indexFileP, data);//read Data from indexer to file.
    }
    Console.WriteLine("Done.");
    IStemmer stem = new RussianStemmer(); //STEMMER
    BoolSyntaxParser bp = new BoolSyntaxParser(); //PARSER OF BOOLEAN EXPRESSIONS
    ILemmatizer lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.
    //STEP 4. BOOLEAN SEARCH BY(indexer)
    /*
     * while(true){
     * Console.WriteLine("Input search str...");
     * String ui = Console.ReadLine();
     *
     * String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
     * LinkedStack<String> ui_w = bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
     *
     * String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
     * foreach(String it2 in ui_wa){
     * Console.WriteLine(it2);
     * }
     * SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
     * __FindLinks(idsOf);
     *
     * }*/
    //STEP 5 Vector SEARCH BY(indexer).
    ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.
    Console.WriteLine("VECTOR SEARCH...\n");
    // Interactive loop: read a query, accumulate per-document tf-idf
    // scores, normalize by document and query length, and print the top
    // 5 documents by relevance.
    while (true) {
        PQ.Clear();
        Console.WriteLine("Input search str...");
        String ui = Console.ReadLine();
        Double[] score = new Double[101];           // per-document score; slots 1..100 used
        //Double[] lengths = new Double[101];//ST_C
        Double[] lengths = builder.GetLens();//ST_UC
        Double q_w = 0.0;                           // squared length of the query vector
        String[] u = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (String t in u) {
            IndexEntry te;
            if (indexer.TryGetValue(lemmer.Lemmatize(t), out te)) {
                q_w += te.IDF * te.IDF;
                // NOTE(review): i starts at 1 while Weights was filled from
                // index 0 in step 3 above -- this looks off by one (and can
                // run past the end when Ids and Probs have equal length);
                // confirm the intended Probs/Weights layout.
                Int32 i = 1;
                foreach (Int32 id in te.Ids) {
                    score[id] += te.Weights[i];
                    //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                    i++;
                }
            }
        }
        q_w = Math.Sqrt(q_w);
        if (q_w == 0.0) {
            Console.WriteLine("NOT FOUND");
        }
        else {
            for (Int32 k = 1; k < 101; k++) {
                if (lengths[k - 1] == 0) //ST_C
                {
                    continue; //ST_C
                }
                //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                if (score[k] == 0.0) {
                    continue;
                }
                // The heap pops smallest Relevance first, so store the
                // reciprocal to surface the highest-scoring documents.
                PQ.Add(new HeapEntry() {
                    Relevance = 1d / score[k], Id = k
                }); //ASC ORDER
            }
            SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
            Int32 KM = 5;   // number of results to display
            while (!PQ.IsEmpty() && KM > 0) {
                HeapEntry et = PQ.DeleteMin();
                Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                docIds.Add(et.Id);
                KM--;
            }
            Console.WriteLine("");
            __FindLinksV(docIds);
        }
    }
}
/// <summary>
/// Extracts keywords from <paramref name="data"/>: tokenizes and
/// lower-cases the text, removes stop words and common words, lemmatizes
/// words longer than three characters, drops the term currently shown in
/// textBox1, and keeps words occurring more than <paramref name="count"/>
/// times, recording every occurrence position of each kept word.
/// </summary>
/// <param name="data">Paragraph of text to mine for keywords.</param>
/// <param name="count">Minimum occurrence threshold (exclusive).</param>
/// <returns>Keywords with per-word occurrence positions, sorted by sortWordsCount.</returns>
private Keywords getKeywords(string data, int count)
{
    string paragraph = data;
    paragraph = paragraph.ToLower();
    string[] words = paragraph.Split(
        new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' },
        StringSplitOptions.RemoveEmptyEntries);
    string[] swords = words.Where(x => !stopWordTest(x)).ToArray();

    List <string> lwords = new List <string>();
    ILemmatizer lemm = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
    foreach (string word in swords)
    {
        if (word.Length == 1)
        {
            continue;                       // single letters carry no signal
        }
        if (word.Length <= 3)
        {
            lwords.Add(word.ToLower());     // too short to lemmatize reliably
        }
        else
        {
            lwords.Add(lemm.Lemmatize(word));
        }
    }

    List <string> fwords = lwords.Where(x => !commonWordTest(x)).ToList();

    //remove keyword (the term the user searched for) from the candidates
    string sptr = textBox1.Text;
    sptr = sptr.ToLower();
    // BUG FIX: the original removed matches by index inside a forward for
    // loop, which skips the element following each removal, so adjacent
    // duplicates of the search term survived. RemoveAll removes every
    // match in a single pass.
    fwords.RemoveAll(w => w.Equals(sptr));

    // Count occurrences and keep only words above the threshold.
    Dictionary <string, int> finallist = new Dictionary <string, int>();
    var cwords = fwords.GroupBy(i => i);
    foreach (var w in cwords)
    {
        if (w.Count() > count)
        {
            finallist.Add(w.Key, w.Count());
            textBox2.AppendText(w.Key + ": " + w.Count() + "\n");
            Console.WriteLine("{0} {1}", w.Key, w.Count());
        }
    }

    // Record the position of every occurrence of each kept keyword.
    Keywords keys = new Keywords();
    for (int i = 0; i < fwords.Count; i++)
    {
        if (finallist.ContainsKey(fwords[i]))
        {
            keys.addOcc(fwords[i], i);
        }
    }
    keys.words.Sort(sortWordsCount);
    return(keys);
}