/// <summary>
/// CLI entry point. With no file arguments, batch-processes every file in
/// 'lemma-source' into 'lemma-output'; with 3 arguments, processes a single
/// source file into the given destination.
/// </summary>
/// <param name="args">[0] language code, [1] source file, [2] destination file.</param>
static void Main(string[] args)
{
    // BUG FIX: the old code guarded against null args in the ternary but then
    // dereferenced args.Length unconditionally, throwing NullReferenceException
    // when the runtime passed a null array. Normalize once up front.
    args = args ?? new string[0];

    LanguagePrebuilt language = args.Length > 0 ? getLanguage(args[0]) : getLanguage();
    ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(language);

    if (args.Length <= 1)
    {
        // Only (optionally) a language was given: batch mode.
        Console.WriteLine("Batch-processing all files contained in the subfolder 'lemma-source' into 'lemma-output'");
        string[] fileList = FileTraverser.getFileList();
        foreach (string file in fileList)
        {
            processFile(file, lemmatizer);
        }
    }
    else if (args.Length == 2)
    {
        // A source file without a destination is incomplete.
        Logger.logError("Missing argument");
    }
    else if (args.Length == 3)
    {
        processFile(args[1], lemmatizer, args[2]);
    }
    else
    {
        Logger.logError("Argument count mismatch, expected max 3 received {0}", args.Length.ToString());
    }
}
/// <summary>
/// Given a list of phrases, return the best matching phrases based on an input phrase.
/// </summary>
/// <param name="inputPhrase">Free-text phrase to match against the candidates.</param>
/// <param name="phrases">Candidate phrases; may be null or empty.</param>
/// <returns>List of up to 2 best matches; null when no candidates were supplied.</returns>
public static List<string> GetMatchingPhrase(string inputPhrase, List<string> phrases)
{
    // BUG FIX: the previous check `phrases?.Count == 0` evaluates to false for a
    // null list (null == 0 is false), so a null argument slipped through and
    // crashed in Lemmatize below. Check null and empty explicitly.
    if (phrases == null || phrases.Count == 0)
    {
        return null;
    }

    // Lemmatize and remove stop-words from the candidate phrases.
    var lmtz = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
    var lemmaListPhrases = Lemmatize(phrases, lmtz);
    for (var i = 0; i < lemmaListPhrases.Count; i++)
    {
        lemmaListPhrases[i] = RemoveStopWords(lemmaListPhrases[i]);
    }

    // Apply the same normalization to the input phrase.
    var lemmaInputPhrase = string.Join(" ", LemmatizePhrase(lmtz, inputPhrase));
    lemmaInputPhrase = RemoveStopWords(lemmaInputPhrase);
    lemmaInputPhrase = SubstituteWords(lemmaInputPhrase); // "your" => "my"

    // Find the best match; an empty-string placeholder is returned when nothing matched.
    var matches = BestSetMatch(lemmaInputPhrase, lemmaListPhrases);
    var matchedPhrases = matches.Count == 0
        ? new List<string> { "" }
        : matches.Take(2).Select(idx => phrases[idx]).ToList();

    // paraphrase
    //var paraphrasedResult = Paraphrase(matchedPhrases[0]);
    return matchedPhrases;
}
/// <summary>
/// Dumps the internal root node of the prebuilt Czech lemmatizer to ExampleFile.txt.
/// </summary>
static void Main(string[] args)
{
    ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Czech);

    // BUG FIX: the StreamWriter was never flushed or disposed, so the output file
    // could be left empty or truncated mid-buffer. File.Create (instead of
    // File.OpenWrite) also truncates stale bytes left over from a previous,
    // longer run — OpenWrite overwrites in place without truncating.
    using (StreamWriter tw = new StreamWriter(File.Create("ExampleFile.txt")))
    {
        Output(((Lemmatizer)lmtz).RootNode, tw, 0, false);
    }
}
/// <summary>
/// Builds a term vector for one HTML document: splits the text, drops blank
/// and stop-word tokens (and single characters), and lemmatizes the rest.
/// </summary>
/// <param name="htmlResult">Raw document text to vectorize.</param>
/// <returns>A DocumentVector over the lemmatized terms.</returns>
private DocumentVector vectorizeDocument(String htmlResult)
{
    var lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);

    var terms = _splitRegex.Split(htmlResult)
        .Where(token => !String.IsNullOrWhiteSpace(token))
        .Where(token =>
        {
            // Stop-word and length filtering is done on the lower-cased form.
            var canonical = token.ToLower();
            return !_stopWords.Contains(canonical) && canonical.Length > 1;
        })
        // NOTE: the original-cased token is what gets lemmatized, not the
        // lower-cased form — preserved from the original implementation.
        .Select(token => lemmatizer.Lemmatize(token));

    return new DocumentVector(terms);
}
/// <summary>
/// Tokenizes the story text on whitespace, punctuation and digits, then
/// lemmatizes every token in place and allocates a matching frequency array.
/// </summary>
/// <param name="story">Story whose words/freq fields are (re)populated.</param>
void Lemmatize(ref Story story)
{
    ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

    // Digits act as separators too, so numbers never survive tokenization.
    char[] separators =
    {
        ' ', '\t', '\n', ',', '/', '\\', '?', '!', '<', '>', '\'', '|', ':', ';', ')', '(', '-',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    story.words = story.text.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    story.freq = new int[story.words.Length];

    // Indexed loop because LemmatizeOne rewrites each element by reference.
    for (int i = 0; i < story.words.Length; i++)
    {
        LemmatizeOne(lemmatizer, ref story.words[i]);
    }
}
/// <summary>
/// Splits the text on simple punctuation, lemmatizes each token, and returns
/// the lemmas joined by single spaces (with a trailing space, as before).
/// </summary>
/// <param name="text">Input sentence or phrase.</param>
/// <returns>Space-separated lemmatized tokens.</returns>
public string send2(string text)
{
    char[] delimiters = { ' ', ',', '.', ')', '(' };
    string[] tokens = text.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

    ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

    StringBuilder result = new StringBuilder();
    foreach (string token in tokens)
    {
        result.Append(LemmatizeOne(lemmatizer, token)).Append(' ');
    }
    return result.ToString();
}
/// <summary>
/// Wires up the lemmatizer and stemmer pair for a supported language.
/// </summary>
/// <param name="language">"English" or "Russian".</param>
/// <exception cref="NotSupportedException">Thrown for any other language name.</exception>
public LanguageSupport(string language)
{
    switch (language)
    {
        case "English":
            Lemmatizer = new LemmaSharp.LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
            Stemmer = new EnglishStemmer();
            break;
        case "Russian":
            Lemmatizer = new LemmaSharp.LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);
            Stemmer = new RussianStemmer();
            break;
        default:
            // BUG FIX: the old message lacked a space before "is" and misspelled
            // "supported" (it rendered as "Language Xis not suported.").
            throw new NotSupportedException("Language " + language + " is not supported.");
    }
}
/// <summary>
/// Normalizes a raw term for counting: strips everything except letters,
/// lower-cases the result, and lemmatizes it.
/// </summary>
/// <param name="term">Raw token from the document text.</param>
/// <param name="lemmatizer">Shared lemmatizer instance.</param>
/// <returns>The lemmatized, lower-case, letters-only form of the term (may be empty).</returns>
private string Preprocess(string term, LemmatizerPrebuiltCompact lemmatizer)
{
    // CLEANUP: the old code first filtered to letters/digits/whitespace with
    // Array.FindAll, then filtered again to letters only, then Trim('\n')'d a
    // string that could no longer contain '\n'. The two earlier steps were
    // dead code — keeping only letters is the net effect, preserved here.
    string input = new string(term.Where(char.IsLetter).ToArray());

    // Make all characters lowercase before lemmatizing.
    input = input.ToLower();

    return lemmatizer.Lemmatize(input);
}
/// <summary>
/// Lemmatizes the input text, runs the Stanford POS tagger over it, and
/// returns only the tokens tagged as verbs (tags containing "/VB"),
/// space-separated with a trailing space.
/// </summary>
/// <param name="text">Raw input sentence(s).</param>
/// <returns>Space-separated "token/TAG" entries for verb tokens.</returns>
public string send(string text)
{
    // Step 1: lemmatize every token of the raw text.
    string[] rawTokens = text.Split(
        new char[] { ' ', ',', '.', ')', '(' },
        StringSplitOptions.RemoveEmptyEntries);

    ILemmatizer lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
    StringBuilder lemmatized = new StringBuilder();
    foreach (string token in rawTokens)
    {
        lemmatized.Append(LemmatizeOne(lemmatizer, token) + " ");
    }
    string finalstring = lemmatized.ToString();

    // Step 2: load the POS tagger from its fixed install location.
    // NOTE(review): hard-coded E:\ path — assumed a known deployment; confirm.
    var jarRoot = @"E:\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09";
    var modelsDirectory = jarRoot + @"\models";
    var tagger = new MaxentTagger(modelsDirectory + @"\wsj-0-18-bidirectional-nodistsim.tagger");

    // Step 3: tag each sentence and keep only verb tokens.
    var verbs = new StringBuilder();
    var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(finalstring)).toArray();
    foreach (ArrayList sentence in sentences)
    {
        var taggedSentence = tagger.tagSentence(sentence);
        string tagged = SentenceUtils.listToString(taggedSentence, false);
        foreach (string token in tagged.Split(' '))
        {
            // "/VB" matches all verb tags (VB, VBD, VBG, VBN, VBP, VBZ).
            if (token.Contains("/VB"))
            {
                verbs.Append(token + " ");
            }
        }
    }
    return verbs.ToString();
}
/// <summary>
/// Downloads an article, extracts paragraph text, strips punctuation,
/// removes stop-words, and returns the space-joined lemmas of the remaining
/// words. Returns "ERROR" on any failure (network, parse, no paragraphs).
/// </summary>
/// <param name="url">Article URL to fetch.</param>
/// <returns>Stemmed/lemmatized article body, or "ERROR".</returns>
static string getArticleBody(string url)
{
    try
    {
        HtmlAgilityPack.HtmlWeb web = new HtmlWeb();
        HtmlAgilityPack.HtmlDocument doc = web.Load(url);
        Console.WriteLine(url);

        // PERF FIX: string += inside both loops was O(n^2); use StringBuilder.
        var textBuilder = new StringBuilder();
        var paragraphNodes = doc.DocumentNode.SelectNodes("//p/text()");
        if (paragraphNodes == null)
        {
            // SelectNodes returns null when nothing matches; the old code relied
            // on the resulting NRE being caught below. Same outcome, explicit.
            return "ERROR";
        }
        foreach (HtmlNode node in paragraphNodes)
        {
            textBuilder.Append(Regex.Replace(node.InnerText, @"[^\w\s]", " ").ToLower());
        }

        string text = Services.StopwordTool.RemoveStopwords(textBuilder.ToString());

        // Get just the list of words.
        string[] toStemSplit = text.Split(
            new char[] { ' ', ',', '.', ')', '(' },
            StringSplitOptions.RemoveEmptyEntries);

        // Load the lemmatizer for English.
        ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

        var stemmed = new StringBuilder();
        foreach (string word in toStemSplit)
        {
            // Lemmatize the lower-cased word and append it to the output.
            stemmed.Append(lmtz.Lemmatize(word.ToLower()) + " ");
        }
        return stemmed.ToString();
    }
    catch (Exception)
    {
        // Best-effort scrape: any failure is reported as the sentinel "ERROR".
        return "ERROR";
    }
}
/// <summary>
/// Groups the input tokens (length > 2) by their Russian lemma and returns
/// one Word per distinct lower-cased surface form, tagged with its lemma
/// group's total frequency. Forms are emitted only when the lemma occurs
/// more than once overall; results are ordered by descending frequency.
/// NOTE(review): the lemmatizer instance is shared across AsParallel()
/// worker threads — assumed thread-safe; confirm against LemmaSharp.
/// </summary>
static public List <Word> EqualWords(string[] str)
{
    ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Russian);
    // num  = total occurrences of the lemma group;
    // word = distinct lower-cased forms (empty unless num > 1, since the
    //        Where condition tests the group count, not the element);
    // num1 = number of those distinct forms.
    var words = (from s in str.AsParallel()
                 where s.Length > 2
                 group s by lmtz.Lemmatize(s.ToLower()) into d
                 let count = d.Count()
                 select new
                 {
                     num = count,
                     word = d.GroupBy(p => p.ToLower()).Where(p => count > 1),
                     num1 = d.GroupBy(p => p.ToLower()).Where(p => count > 1).Count()
                 }).OrderByDescending(p => p.num);
    List <Word> wordsList = new List <Word>();
    // Flatten: one Word per surface form, carrying its group's counts.
    foreach (var y in words)
    {
        foreach (var s in y.word)
        {
            wordsList.Add(new Word(s.Key, y.num, y.num1));
        }
    }
    return(wordsList);
}
/// <summary>
/// Maps a UI dropdown label to the corresponding prebuilt compact lemmatizer.
/// Unrecognized labels fall back to English.
/// </summary>
/// <param name="LemmatizerDropdownSelection">Dropdown display text, e.g. "français (French)".</param>
/// <returns>A lemmatizer for the selected language.</returns>
public LemmaSharp.LemmatizerPrebuiltCompact LemmaGenChoice(string LemmatizerDropdownSelection)
{
    // PERF/CLEANUP: the old code allocated a default English lemmatizer and
    // then threw it away whenever another case matched. Resolve the language
    // first and construct exactly one lemmatizer.
    LanguagePrebuilt language;
    switch (LemmatizerDropdownSelection)
    {
        // NOTE(review): the label says "Беларуская" (Belarusian) but maps to
        // Bulgarian — preserved from the original; confirm the UI label.
        case "Беларуская (Bulgarian)": language = LanguagePrebuilt.Bulgarian; break;
        case "čeština (Czech)": language = LanguagePrebuilt.Czech; break;
        case "English": language = LanguagePrebuilt.English; break;
        case "Eesti (Estonian)": language = LanguagePrebuilt.Estonian; break;
        case "فارسی (Persian)": language = LanguagePrebuilt.Persian; break;
        case "français (French)": language = LanguagePrebuilt.French; break;
        case "Magyar (Hungarian)": language = LanguagePrebuilt.Hungarian; break;
        case "Македонски (Macedonian)": language = LanguagePrebuilt.Macedonian; break;
        case "polski (Polish)": language = LanguagePrebuilt.Polish; break;
        case "Română (Romanian)": language = LanguagePrebuilt.Romanian; break;
        case "Pyccĸий (Russian)": language = LanguagePrebuilt.Russian; break;
        case "Slovenčina (Slovak)": language = LanguagePrebuilt.Slovak; break;
        case "Slovene": language = LanguagePrebuilt.Slovene; break;
        case "Srpski / Српски (Serbian)": language = LanguagePrebuilt.Serbian; break;
        case "Українська (Ukranian)": language = LanguagePrebuilt.Ukrainian; break;
        case "EnglishMT": language = LanguagePrebuilt.EnglishMT; break;
        case "françaisMT": language = LanguagePrebuilt.FrenchMT; break;
        case "Deutsch (German)": language = LanguagePrebuilt.German; break;
        case "italiano (Italian)": language = LanguagePrebuilt.Italian; break;
        case "Español (Spanish)": language = LanguagePrebuilt.Spanish; break;
        default: language = LanguagePrebuilt.English; break;
    }
    return new LemmaSharp.LemmatizerPrebuiltCompact(language);
}
/// <summary>
/// Counts lemmatized unigrams and named entities (uni/bi/trigrams) in the
/// text, only taking tokens of at least 3 chars, and returns those whose
/// count reaches the threshold.
/// </summary>
/// <param name="text">Input text to tokenize.</param>
/// <param name="threeshold">Minimum count a term must reach to be returned.</param>
/// <param name="language">"eng" or "fra" enable lemmatization; anything else counts raw tokens.</param>
/// <returns>Term → count dictionary, filtered by the threshold.</returns>
private static Dictionary <string, int> Tokenize(string text, int threeshold, string language)
{
    Dictionary <string, int> WordCount = new Dictionary <string, int>();

    // Only English and French have prebuilt lemmatizers wired up here.
    ILemmatizer lmtz = null;
    switch (language)
    {
        case "eng":
            lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
            break;
        case "fra":
            lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French);
            break;
    }

    text = text.Replace("\r\n", " ");
    Dictionary <string, int> entities = NlpHelper.GetNamedEntititesForText(text);
    LogHelper.Log("entities:" + entities.Count.ToString());

    string[] words = text.Split(new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        // CLEANUP: `word` is already lower-cased once here; the old code
        // re-applied ToLowerInvariant() on every entity lookup.
        var word = words[i].ToLowerInvariant();
        var LeftWord = (i > 0) ? words[i - 1].ToLowerInvariant() : string.Empty;
        var RightWord = (i < (words.Length - 1)) ? words[i + 1].ToLowerInvariant() : string.Empty;

        if (word.Length < 3) //avoid unnecessary lemmatization
        {
            continue;
        }

        string LeftBiGramKey = string.Concat(LeftWord, " ", word);
        string RightBiGramKey = string.Concat(word, " ", RightWord);
        string TriGramKey = string.Concat(LeftWord, " ", word, " ", RightWord);

        // Prefer the first matching named-entity key (unigram, then bigrams,
        // then trigram); entity code 2 is excluded from counting.
        // NOTE(review): code 2's meaning isn't visible here — confirm against
        // NlpHelper.GetNamedEntititesForText.
        // PERF FIX: TryGetValue replaces the old ContainsKey + indexer double lookups.
        string NamedEntity = null;
        int entityCode;
        if (entities.TryGetValue(word, out entityCode))
        {
            if (entityCode != 2) { NamedEntity = word; }
        }
        else if (entities.TryGetValue(LeftBiGramKey, out entityCode))
        {
            if (entityCode != 2) { NamedEntity = LeftBiGramKey; }
        }
        else if (entities.TryGetValue(RightBiGramKey, out entityCode))
        {
            if (entityCode != 2) { NamedEntity = RightBiGramKey; }
        }
        else if (entities.TryGetValue(TriGramKey, out entityCode))
        {
            if (entityCode != 2) { NamedEntity = TriGramKey; }
        }

        // Resolve the key to count: the named entity when one was found,
        // otherwise the (possibly lemmatized) single word.
        string key;
        if (NamedEntity != null)
        {
            key = NamedEntity;
        }
        else
        {
            string lemma = (lmtz != null) ? LemmatizeOne(lmtz, word) : word;
            if (lemma.Length < 3) //ignore lemma of less than 3 characters
            {
                continue;
            }
            key = lemma;
        }

        // Single shared counting path (old code duplicated it per branch).
        int current;
        WordCount.TryGetValue(key, out current);
        WordCount[key] = current + 1;
    }

    // Keep only the terms that meet the threshold.
    Dictionary <string, int> ElligibleWords = WordCount
        .Where(w => w.Value >= threeshold)
        .ToDictionary(w => w.Key, w => w.Value);
    return ElligibleWords;
}
/// <summary>
/// Console driver for the text-retrieval pipeline. Steps 1 (HTML parsing),
/// 2 (stemming) and 4 (boolean search) are commented out; the active path
/// builds the inverted index and its tf-idf weights (step 3), then enters an
/// endless interactive vector-search loop (step 5). Never returns.
/// NOTE(review): documents are assumed to have ids 1..100 (the hard-coded
/// 101-sized arrays and the 100.0 in the IDF formula) — confirm.
/// </summary>
static void Main(string[] args) { //Console.WriteLine("Input link: "); //String s = Console.ReadLine(); //Console.WriteLine("Input Directory to save: "); //String d = Console.ReadLine(); String startupPath = System.IO.Directory.GetCurrentDirectory(); //startupPath = Path.Combine(startupPath, d); dsFiles = Path.Combine(startupPath, "Data");//Path to source files. //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas /* STEP 1. PARSE HTML * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser()); * * parser.SetSettings(new NekdoSettings(1,100)); * parser.OnNewData += NewData; * parser.OnComplete += Complete; * dir = new DirectoryInfo(startupPath); * try{ * dir.Create(); * } * catch(IOException){ * Console.WriteLine("This directory has already exist. Continue work with this directory"); * } * parser.Start(); * while(parser.IsActive()){//awaiting parser... * * } * * * * CreateIndexF(parser.GetUrls()); */ //STEP 2 STEMMING /* * TrainDataParser TDP = new TrainDataParser(); * * * Lemmatization(TDP); * * Console.WriteLine(""); */ //STEP 3 CREATING INDEX. String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt"); Console.WriteLine("===STEP 3 ==="); IndexBuilder builder = new IndexBuilder(); Console.WriteLine("Source: {0} ", builder.Source); Console.WriteLine("Dest: {0}", indexFileP); LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX // UNCOMMENT FOR VECTOR RETRIEVAL (STEP 5) foreach (KeyValuePair <String, IndexEntry> p in indexer) { Double I = Math.Round(100.0 / p.Value.Ids.Count, 5); p.Value.IDF = I;//Math.Log(100.0/p.Value.Ids.Count, 10.0); foreach (Double prob in p.Value.Probs) { p.Value.Weights.Add(prob * I); //tf(t,d)*idf(t,D) = tf-idf(t,d,D) } //String data = p.Key +" : "+ p.Value; //__CreateIFile(indexFileP, data);//read Data from indexer to file. 
// End of the tf-idf weighting loop. Below: stemmer/parser/lemmatizer setup
// and the interactive vector-search loop; each query's terms are lemmatized,
// scored against per-document weights, and normalized by document length.
} Console.WriteLine("Done."); IStemmer stem = new RussianStemmer(); //STEMMER BoolSyntaxParser bp = new BoolSyntaxParser(); //PARSER OF BOOLEAN EXPRESSIONS ILemmatizer lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER. //STEP 4. BOOLEAN SEARCH BY(indexer) /* * while(true){ * Console.WriteLine("Input search str..."); * String ui = Console.ReadLine(); * * String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries); * LinkedStack<String> ui_w = bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION * * String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY * foreach(String it2 in ui_wa){ * Console.WriteLine(it2); * } * SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa); * __FindLinks(idsOf); * * }*/ //STEP 5 Vector SEARCH BY(indexer). ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT. Console.WriteLine("VECTOR SEARCH...\n"); while (true) { PQ.Clear(); Console.WriteLine("Input search str..."); String ui = Console.ReadLine(); Double[] score = new Double[101]; //Double[] lengths = new Double[101];//ST_C Double[] lengths = builder.GetLens();//ST_UC Double q_w = 0.0; String[] u = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries); foreach (String t in u) { IndexEntry te; if (indexer.TryGetValue(lemmer.Lemmatize(t), out te)) { q_w += te.IDF * te.IDF; Int32 i = 1; foreach (Int32 id in te.Ids) { score[id] += te.Weights[i]; //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C i++; } } } q_w = Math.Sqrt(q_w); if (q_w == 0.0) { Console.WriteLine("NOT FOUND"); } else { for (Int32 k = 1; k < 101; k++) { if (lengths[k - 1] == 0) //ST_C { continue; //ST_C } //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C //score[k] = score[k]/(lengths[k]*q_w);//ST_C score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0. 
// Zero-scored documents are skipped; the rest are pushed on the heap with
// inverted relevance (1/score) so the min-heap pops the best match first.
// Only the top 5 document ids are printed and passed to __FindLinksV.
if (score[k] == 0.0) { continue; } PQ.Add(new HeapEntry() { Relevance = 1d / score[k], Id = k }); //ASC ORDER } SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>(); Int32 KM = 5; while (!PQ.IsEmpty() && KM > 0) { HeapEntry et = PQ.DeleteMin(); Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance); docIds.Add(et.Id); KM--; } Console.WriteLine(""); __FindLinksV(docIds); } } }
/// <summary>
/// Extracts keywords from the given text: lower-cases it, splits on
/// punctuation, drops stop-words and common words, lemmatizes tokens longer
/// than 3 characters (shorter tokens are kept as-is), removes the word equal
/// to `sptr`, and returns a Keywords result containing every word that
/// occurs more than `count` times, together with its positions.
/// Also appends each kept keyword and its count to textBox2 (UI side effect).
/// NOTE(review): `fwords.Remove(fwords[i])` inside the indexed for-loop skips
/// the element that shifts into slot i after a removal — confirm intended.
/// </summary>
private Keywords getKeywords(string data, int count) { string paragraph = data;// "Simple computers are small enough to fit into mobile devices, and mobile computers can be powered by small batteries. Personal computers in their various forms are icons of the Information Age and are what most people think of as “computers.” However, the embedded computers found in many devices from MP3 players to fighter aircraft and from toys to industrial robots are the most numerous."; paragraph = paragraph.ToLower(); string[] words = paragraph.Split(new char[] { ' ', ',', '.', '(', ')', '[', ']', '“', '”', '"', '\n', '!' }, StringSplitOptions.RemoveEmptyEntries); string[] swords = words.Where(x => !stopWordTest(x)).ToArray(); List <string> lwords = new List <string>(); ILemmatizer lemm = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English); foreach (string word in swords) { if (word.Length == 1) { continue; } if (word.Length <= 3) { //Console.WriteLine(word); lwords.Add(word.ToLower()); } else { lwords.Add(lemm.Lemmatize(word)); } } List <string> fwords = new List <string>(); fwords = lwords.Where(x => !commonWordTest(x)).ToList(); //remove keyword // string sptr = textBox1.Text; sptr = sptr.ToLower(); // foreach (string sp in fwords) // if (sp==sptr) fwords.Remove(sp); // for (int i = 0; i < fwords.Count; i++) { if (fwords[i].Equals(sptr)) { fwords.Remove(fwords[i]); } } Dictionary <string, int> finallist = new Dictionary <string, int>(); var cwords = fwords.GroupBy(i => i); foreach (var w in cwords) { if (w.Count() > count) { finallist.Add(w.Key, w.Count()); textBox2.AppendText(w.Key + ": " + w.Count() + "\n"); Console.WriteLine("{0} {1}", w.Key, w.Count()); } } Keywords keys = new Keywords(); for (int i = 0; i < fwords.Count; i++) { if (finallist.ContainsKey(fwords[i])) { keys.addOcc(fwords[i], i); } } keys.words.Sort(sortWordsCount); return(keys); }
/// <summary>
/// Initializes the normalizer with the prebuilt compact English lemmatizer.
/// </summary>
public Normalizer()
{
    lemmatizer = new LemmatizerPrebuiltCompact(
        LemmaSharp.LanguagePrebuilt.English);
}
/// <summary>
/// Builds a term-frequency dictionary from a Wikipedia-style page: the
/// paragraphs, ordered-list items and unordered-list items under the
/// mw-parser-output div. Terms are preprocessed (letters-only, lower-case,
/// lemmatized); stop-words, empty results and tokens longer than 14 chars
/// are skipped.
/// </summary>
/// <param name="doc">Parsed HTML document.</param>
/// <returns>Term → occurrence count.</returns>
private Dictionary <string, int> GetTermsDictionary(HtmlDocument doc)
{
    Dictionary <string, int> dict = new Dictionary <string, int>();

    // Content sections of interest. SelectNodes returns null when no node matches.
    var contents = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/p");
    var orderedLists = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/ol/li");
    var unorderedLists = doc.DocumentNode.SelectNodes("//div[contains(@class, 'mw-parser-output')]/ul/li");

    var lemmatizer = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);

    // CLEANUP: the identical counting loop was pasted three times; it now
    // lives in one helper applied to each node set in turn.
    CountTermsFromNodes(contents, dict, lemmatizer);
    CountTermsFromNodes(orderedLists, dict, lemmatizer);
    CountTermsFromNodes(unorderedLists, dict, lemmatizer);

    return dict;
}

// Tokenizes each node's inner text and accumulates term counts into dict.
// A null node collection (no XPath match) is a no-op.
private void CountTermsFromNodes(HtmlNodeCollection nodes, Dictionary <string, int> dict, LemmatizerPrebuiltCompact lemmatizer)
{
    if (nodes == null)
    {
        return;
    }
    foreach (var content in nodes)
    {
        string[] terms = content.InnerText.Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (var term in terms)
        {
            string input = Preprocess(term, lemmatizer);
            // Don't count stop-words.
            if (stopwords.Contains(input))
            {
                continue;
            }
            if (dict.ContainsKey(input))
            {
                dict[input] += 1;          // already seen: bump the count
            }
            else if (input.Length > 14)
            {
                continue;                  // ignore implausibly long tokens
            }
            else if (!string.IsNullOrEmpty(input))
            {
                dict.Add(input, 1);        // first occurrence
            }
        }
    }
}