public void StemTest()
{
    Stemmer stemmer = new Stemmer();
    string input, expected, actual;

    input = "کتابی";
    expected = "کتاب";
    actual = stemmer.Stem(input);
    Assert.AreEqual(expected, actual, "Failed to stem '" + input + "'");

    input = "کتابها";
    expected = "کتاب";
    actual = stemmer.Stem(input);
    Assert.AreEqual(expected, actual, "Failed to stem '" + input + "'");

    input = "کتابهایی";
    expected = "کتاب";
    actual = stemmer.Stem(input);
    Assert.AreEqual(expected, actual, "Failed to stem '" + input + "'");

    input = "کتابهایشان";
    expected = "کتاب";
    actual = stemmer.Stem(input);
    Assert.AreEqual(expected, actual, "Failed to stem '" + input + "'");

    input = "اندیشهاش";
    expected = "اندیشه";
    actual = stemmer.Stem(input);
    Assert.AreEqual(expected, actual, "Failed to stem '" + input + "'");
}
public Dictionary<string, int> GetTopNWordsDictionary(int N)
{
    string[] ignoreWords = { "*" };
    Dictionary<string, int> wordCount = new Dictionary<string, int>();
    StringBuilder sbFullText = new StringBuilder();
    foreach (children child in this.children)
    {
        sbFullText.Append(child.SubtreeText);
        sbFullText.Append(" ");
    }
    string[] allWords = GetAllWords(sbFullText.ToString());
    // Map each stem to the longest surface form seen, so results are shown
    // as readable words rather than stems.
    Dictionary<string, string> stemParent = new Dictionary<string, string>();
    foreach (string word in allWords)
    {
        try
        {
            string stemmed = Stemmer.GetStem(word);
            if (!stemParent.ContainsKey(stemmed) || stemParent[stemmed].Length < word.Length)
            {
                stemParent[stemmed] = word;
            }
            // Skip stop words and ignored tokens before counting, so an ignored
            // token is never incremented without having been initialized.
            if (stopWords.Contains(stemmed.ToLower()) || ignoreWords.Contains(stemmed))
            {
                continue;
            }
            if (!wordCount.ContainsKey(stemmed))
            {
                wordCount[stemmed] = 1;
            }
            else
            {
                wordCount[stemmed] += 1;
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.ToString());
        }
    }
    return wordCount
        .OrderByDescending(x => x.Value)
        .Take(N)
        .ToDictionary(kvp => stemParent[kvp.Key], kvp => kvp.Value);
}
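The stemParent table above exists only so the final dictionary is keyed by a readable surface form instead of a raw stem. A standalone sketch of that mapping, with a hypothetical one-line Stem function standing in for the real stemmer:

using System.Collections.Generic;

static class StemDisplay
{
    // Hypothetical stand-in for a real stemmer: trims a plural "s".
    static string Stem(string w) => w.EndsWith("s") ? w.Substring(0, w.Length - 1) : w;

    // Map each stem to the longest surface form seen, mirroring stemParent above.
    public static Dictionary<string, string> LongestSurfaceForms(IEnumerable<string> words)
    {
        var stemParent = new Dictionary<string, string>();
        foreach (var word in words)
        {
            string stem = Stem(word);
            if (!stemParent.ContainsKey(stem) || stemParent[stem].Length < word.Length)
            {
                stemParent[stem] = word;
            }
        }
        return stemParent;
        // { "book", "books", "booking" } -> { "book": "books", "booking": "booking" }
    }
}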
/// <summary>
/// Returns the lemmatized words of the query (when no base form can be
/// found, the word is stemmed instead, i.e. its ending is trimmed).
/// </summary>
/// <param name="request">the words of the query</param>
/// <returns>the processed list of query words</returns>
public static List<string> GetStremmingWords(string[] request)
{
    var result = new List<string>();
    foreach (var word in request)
    {
        var infWord = Analyser.FindAllSourceForm(word).FirstOrDefault();
        // FirstOrDefault can return null, so guard the dereference.
        result.Add(infWord?.SourceForm ?? Stemmer.Stemm(word));
    }
    return result;
}
private void GenerateInlines()
{
    string[] searchWords = SearchIndex.StemWords(SearchIndex.GetWords(this.SearchText));
    if (searchWords.Length == 0)
    {
        return;
    }
    string[] inputWords = SearchIndex.GetWords(this.InputText);
    if (inputWords.Length == 0)
    {
        return;
    }
    List<string> highlightWords = new List<string>();
    Stemmer stemmer = new Stemmer();
    foreach (string word in inputWords)
    {
        if (Enumerable.Contains<string>(searchWords, stemmer.Stem(word)))
        {
            highlightWords.Add(word);
        }
    }
    string text = this.InputText;
    Regex regex = GetRegexFromWordList(highlightWords.ToArray());
    int index = 0;
    if (regex != null)
    {
        MatchCollection matches = regex.Matches(text);
        foreach (Match match in matches)
        {
            if (match.Index > index)
            {
                this.Inlines.Add(new Run(text.Substring(index, match.Index - index)));
            }
            string searchWord = text.Substring(match.Index, match.Length);
            this.Inlines.Add(new Bold(new Run(searchWord)));
            index = match.Index + match.Length;
        }
    }
    if (index < text.Length)
    {
        this.Inlines.Add(new Run(text.Substring(index, text.Length - index)));
    }
    Assert.IsTrue(this.Inlines.Count != 0);
}
/// <summary>
/// Lemmatizes the words of the user's raw query for vector search.
/// </summary>
/// <param name="request">the words of the query</param>
/// <returns>the lemmatized (or stemmed) words</returns>
public static string[] GetStremmingWordsForQuery(string[] request)
{
    var result = new string[request.Length];
    for (int i = 0; i < request.Length; i++)
    {
        var infWord = Analyser.FindAllSourceForm(request[i]).FirstOrDefault();
        // FirstOrDefault can return null, so guard the dereference.
        result[i] = infWord?.SourceForm ?? Stemmer.Stemm(request[i]);
    }
    return result;
}
public string StemText(string text)
{
    StringBuilder result = new StringBuilder();
    TokenStream stream = Stemmer.TokenStream(String.Empty, new StringReader(text));
    while (stream.IncrementToken())
    {
        TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
        result.Append(termAttr.Term()).Append(' ');
    }
    return result.ToString().Trim();
}
public int GetTextForMark(string text)
{
    Stemmer stemmer = new Stemmer();
    var words = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        words[i] = stemmer.Stem(words[i]);
    }
    List<string> stemmedWords = words.ToList();

    // Load the per-word correlation weights from disk.
    Dictionary<string, double> correlationTable = new Dictionary<string, double>();
    using (StreamReader reader = new StreamReader("correlations.txt"))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            double correlation;
            string[] splittedLine = line.Split(' ');
            if (splittedLine.Length == 2 &&
                Double.TryParse(splittedLine[1], out correlation) &&
                splittedLine[0] != "")
            {
                correlationTable.Add(splittedLine[0], correlation);
            }
        }
    }

    // Sum the log-weights of the known words, then convert to a probability.
    double logOfP = 0;
    foreach (var word in stemmedWords)
    {
        if (correlationTable.ContainsKey(word))
        {
            logOfP += correlationTable[word];
        }
    }
    double pFraction = Math.Exp(logOfP);
    double result = pFraction / (pFraction + 1);
    return result > 0.5 ? 1 : 0;
}
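The last few statements convert the summed log-weights into a probability via the logistic function, so the 0.5 threshold is equivalent to testing whether the sum is positive:

$$P = \frac{e^{\sum_i w_i}}{e^{\sum_i w_i} + 1} = \sigma\Big(\sum_i w_i\Big), \qquad P > 0.5 \iff \sum_i w_i > 0,$$

where the $w_i$ are the correlation weights read from correlations.txt.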
private static Hashtable genStopwordTable(string path)
{
    Hashtable stopwordTable = new Hashtable();
    StreamReader stopFile = new StreamReader(path);
    string line;
    string word;
    Stemmer stemmer = new Stemmer();
    while ((line = stopFile.ReadLine()) != null)
    {
        // Stem the trimmed stop word; the length passed to add() must be the
        // trimmed string's length, not the raw line's.
        string trimmed = line.Trim();
        stemmer.add(trimmed.ToCharArray(), trimmed.Length);
        stemmer.stem();
        word = stemmer.ToString();
        stopwordTable[word.ToLower()] = 1;
    }
    stopFile.Close();
    return stopwordTable;
}
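The add/stem/ToString call sequence above recurs in several snippets below. A small helper keeps those call sites short; this is a sketch assuming the same Porter-style Stemmer class used throughout this listing:

// Convenience wrapper around the Porter-style Stemmer API (add/stem/ToString)
// used in the surrounding snippets.
static string StemWord(string word)
{
    Stemmer stemmer = new Stemmer();
    stemmer.add(word.ToCharArray(), word.Length);
    stemmer.stem();
    return stemmer.ToString();
}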
/// <summary>
/// Applies stemming to every word of each bug report that has already had
/// its stop words removed.
/// </summary>
/// <param name="noStopwordsBugReportList">bug reports with stop words removed</param>
/// <returns>the stemmed bug reports</returns>
private List<string> ApplyStemming(List<string> noStopwordsBugReportList)
{
    List<string> stemmedList = new List<string>();
    foreach (var item in noStopwordsBugReportList)
    {
        string[] words = item.Split(' ');
        string finalStemOutput = "";
        foreach (string word in words)
        {
            Stemmer temp = new Stemmer();
            temp.add(word.ToCharArray(), word.Length);
            temp.stem();
            var stemOutput = temp.ToString();
            finalStemOutput += stemOutput + " ";
        }
        stemmedList.Add(finalStemOutput);
    }
    return stemmedList;
}
/// <summary>
/// Processes the specified text.
/// </summary>
/// <param name="text">The text.</param>
/// <returns>The resulting document object.</returns>
public Document Process(string text)
{
    var TempText = NormalizerManager.Normalize(text);
    var Tokens = Tokenizer.Tokenize(TempText, TokenizerLanguage);
    Tokens = NormalizerManager.Normalize(Tokens);
    Tokens = Stemmer.Stem(Tokens, StemmerLanguage);
    Tokens = StopWordsManager.MarkStopWords(Tokens, StopWordsLanguage);
    var Sentences = SentenceDetector.Detect(Tokens, SentenceDetectorLanguage);
    for (int x = 0; x < Sentences.Length; ++x)
    {
        var Sentence = Sentences[x];
        Sentence.Tokens = POSTagger.Tag(Sentence.Tokens, POSTaggerLanguage);
    }
    Tokens = EntityFinder.Find(Tokens, EntityFinderType);
    Sentences = SentenceDetector.Detect(Tokens, SentenceDetectorLanguage);
    return new Document(Sentences, Tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
}
Dictionary<string, HashSet<int>> GetWordIDMapping(children child)
{
    Dictionary<string, HashSet<int>> wordIDMapping = new Dictionary<string, HashSet<int>>();
    string[] allWords = GetAllWords(child.text);
    foreach (string word in allWords)
    {
        if (stopWords.Contains(word.ToLower()))
        {
            continue;
        }
        if (word.Length < 3 && word.Any(c => char.IsLower(c)))
        {
            continue;
        }
        string stem = Stemmer.GetStem(word);
        if (!wordIDMapping.ContainsKey(stem))
        {
            wordIDMapping[stem] = new HashSet<int>();
        }
        wordIDMapping[stem].Add(child.id);
    }
    // Recurse into the children and union the ID sets per stem.
    foreach (children childitem in child.Children)
    {
        Dictionary<string, HashSet<int>> mapping = GetWordIDMapping(childitem);
        foreach (var kvp in mapping)
        {
            if (wordIDMapping.ContainsKey(kvp.Key))
            {
                wordIDMapping[kvp.Key].UnionWith(kvp.Value);
            }
            else
            {
                wordIDMapping[kvp.Key] = kvp.Value;
            }
        }
    }
    return wordIDMapping;
}
static double pmi(string ti, string tj, Dictionary<string, double> prob)
{
    // Stem and lowercase both terms before the probability lookups.
    ti = ti.ToLower();
    Stemmer s = new Stemmer();
    s.add(ti.ToCharArray(), ti.Length);
    s.stem();
    ti = s.ToString();

    tj = tj.ToLower();
    Stemmer s2 = new Stemmer();
    s2.add(tj.ToCharArray(), tj.Length);
    s2.stem();
    tj = s2.ToString();

    double pmi = 0;
    if (!prob.ContainsKey(ti + " " + tj) && !prob.ContainsKey(tj + " " + ti))
    {
        return 0;
    }
    // Joint probability: take whichever ordering(s) of the pair are present.
    if (prob.ContainsKey(ti + " " + tj))
    {
        pmi += prob[ti + " " + tj];
    }
    if (prob.ContainsKey(tj + " " + ti))
    {
        pmi += prob[tj + " " + ti];
    }
    if (!prob.ContainsKey(ti) || !prob.ContainsKey(tj))
    {
        return 0;
    }
    pmi /= (prob[ti] * prob[tj]);
    pmi = Math.Log(pmi, 2);
    return pmi;
}
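For reference, the quantity computed above is standard pointwise mutual information, with the joint probability read from whichever ordering(s) of the pair appear in prob:

$$\mathrm{PMI}(t_i, t_j) = \log_2 \frac{P(t_i, t_j)}{P(t_i)\,P(t_j)}$$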
public CheakSpell()
{
    try
    {
        _globalDic = new List<string>();
        _userDic = new List<string>();
        _ignoreList = new List<string>();
        _stopWordList = new List<string>();
        _ignoreCharList = new List<char>();
        var persianWordFrequencyOpration = new PS_PersianWordFrequencyOpration();
        // Load from DB.
        var listParsianWordfreq = persianWordFrequencyOpration.GetAll();
        _sundex = new Soundex(listParsianWordfreq.Where(x => x.Sundex.Length > 0).ToList());
        _norvan = new NorvigSpellChecker(listParsianWordfreq);
        _stemmr = new Stemmer(listParsianWordfreq);
        foreach (var item in listParsianWordfreq)
        {
            _globalDic.Add(item.Val1.Trim());
        }
        var lsStop = new PS_StopWordOpration();
        foreach (var item in lsStop.GetAll())
        {
            _stopWordList.Add(item.Val1.Trim());
        }
        var userDicOpration = new UserDicOpration();
        _userDic = userDicOpration.LoadAll();
    }
    catch (Exception)
    {
        // ignored
    }
}
/// <summary>
/// Method to filter input text.
/// </summary>
/// <param name="text">the text to filter</param>
/// <returns>the filtered text</returns>
private string FilterText(string text)
{
    var currDir = System.Environment.GetFolderPath(System.Environment.SpecialFolder.ApplicationData);
    // Combine the base folder with the application-specific folder.
    string specificFolder = System.IO.Path.Combine(currDir, "MARC 3.0");
    // Check if the folder exists and, if not, create it.
    if (!Directory.Exists(specificFolder))
    {
        Directory.CreateDirectory(specificFolder);
    }
    // String.Replace returns a new string, so the result must be assigned back.
    text = text.Replace('.', ' ');
    if (NoSWCheckboxCheckedState)
    {
        StopWordRemoval.StopWordRemoval temp = new StopWordRemoval.StopWordRemoval(text, specificFolder);
        text = temp.output;
    }
    if (STCheckboxCheckedState)
    {
        string[] words = text.Split(' ');
        string finalStemOutput = "";
        foreach (string word in words)
        {
            Stemmer temp = new Stemmer();
            temp.add(word.ToCharArray(), word.Length);
            temp.stem();
            var stemOutput = temp.ToString();
            finalStemOutput += stemOutput + " ";
        }
        text = finalStemOutput;
    }
    text = RemoveSpecialCharacters(text);
    return text;
}
private static string getWord(string word, Hashtable stopwordTable)
{
    Stemmer stemmer = new Stemmer();
    string result = word.ToLower().Trim(new Char[] { '_', '-', '.' });
    double Num;
    bool isNum = double.TryParse(word, out Num);
    if (isNum)
    {
        return null;
    }
    stemmer.add(result.ToCharArray(), result.Length);
    stemmer.stem();
    result = stemmer.ToString();
    if (result.Length == 0)
    {
        return null;
    }
    if (stopwordTable.ContainsKey(result))
    {
        return null;
    }
    return result;
}
/// <summary>
/// Retrieves the results of a single query.
/// </summary>
/// <param name="query"></param>
/// <param name="language"></param>
/// <param name="queryId"></param>
public void retriveSingleQuery(string query, string language, int queryId)
{
    Stemmer stemmer = new Stemmer();
    string[] parseQuery = searcher.ParseQuery(query);
    List<string> semanticQuery = searcher.AddSemantic(parseQuery.ToList());
    List<string> queryList = parseQuery.ToList();
    Dictionary<string, Dictionary<string, int>> QueryTermsOccurrences = new Dictionary<string, Dictionary<string, int>>();
    Dictionary<string, Dictionary<string, int>> SemanticQuery = new Dictionary<string, Dictionary<string, int>>();
    if (Properties.Settings.Default.stemmer)
    {
        for (int i = 0; i < queryList.Count; i++)
        {
            queryList[i] = stemmer.stemTerm(queryList[i]);
        }
    }
    QueryTermsOccurrences = searcher.AllQueryOccurrences(queryList.ToArray(), language);
    SemanticQuery = searcher.AllQueryOccurrences(semanticQuery.ToArray(), language);
    //List<string> cluster = searcher.index.buildCarrot2(parseQuery, QueryPerformances);
    ConcurrentDictionary<string, double> ranking = ranker.CalculateTotalRank(queryList.ToArray(), semanticQuery, QueryTermsOccurrences, SemanticQuery);
    QueriesResults[queryId] = ranker.sortRanking(ranking);
}
static List<int> positions(string word, string docName)
{
    word = word.ToLower();
    Stemmer s = new Stemmer();
    s.add(word.ToCharArray(), word.Length);
    s.stem();
    word = s.ToString();

    List<int> pos = new List<int>();
    using (StreamReader sr = new StreamReader("Web_Documents/" + docName))
    {
        string line;
        int count = 0;
        char[] delim = { '.', ',', ';', ':', '-', '!', '?', '"', '\'', '`', '(', ')', '[', ']', '{', '}', ' ', '\t' };
        while ((line = sr.ReadLine()) != null)
        {
            string[] words = line.Split(delim, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < words.Length; i++)
            {
                // Stem each document token the same way as the query word and
                // record its 1-based position in the document.
                words[i] = words[i].ToLower();
                Stemmer s2 = new Stemmer();
                s2.add(words[i].ToCharArray(), words[i].Length);
                s2.stem();
                words[i] = s2.ToString();
                count++;
                if (words[i].Equals(word))
                {
                    pos.Add(count);
                }
            }
        }
    }
    return pos;
}
private List<string> SplitsTheParagraphInWords(string text, bool isPositive)
{
    List<string> WordsList = new List<string>();
    Stemmer stemmer = new Stemmer();
    text = text.Trim().ToLower();
    if (text[text.Length - 1] != '.')
    {
        text += ".";
    }
    // Replace punctuation (everything except the sentence-final period) with spaces.
    char[] punctuation = { ',', ';', ':', '"', '\'', '!', '?', '(', ')', ']', '[', '<', '>', '+', '*', '%', '&', '$', '=', '^', '-', '/', '\\', '@', '_' };
    foreach (char c in punctuation)
    {
        text = text.Replace(c, ' ');
    }
    string[] Sentences = text.Split('.');
    for (int i = 0; i < Sentences.Length; i++)
    {
        if (string.IsNullOrEmpty(Sentences[i]))
        {
            continue;
        }
        string[] Words = Sentences[i].Trim().Split(' ');
        for (int j = 0; j < Words.Length; j++)
        {
            if (IsWordContractionOrStop(Words[j]) || string.IsNullOrEmpty(Words[j]) ||
                Words[j].Length <= 3 || Words[j].Length >= 13)
            {
                continue;
            }
            Words[j] = StemTheWord(Words[j], stemmer);
            if (IsStemmedWordPartOfList(Words[j], isPositive))
            {
                continue;
            }
            if (!WordsList.Contains(Words[j]))
            {
                if (Words[j].Length > 2)
                {
                    WordsList.Add(Words[j]);
                    // PositiveTimes/NegativeTimes run parallel to the words added
                    // across calls; this call's words occupy the tails of those lists.
                    if (isPositive)
                    {
                        PositiveTimes.Add(1);
                        NegativeTimes.Add(0);
                    }
                    else
                    {
                        PositiveTimes.Add(0);
                        NegativeTimes.Add(1);
                    }
                }
            }
            else
            {
                try
                {
                    int indexOfWord = WordsList.IndexOf(Words[j]);
                    if (isPositive)
                    {
                        PositiveTimes[PositiveTimes.Count - WordsList.Count + indexOfWord]++;
                    }
                    else
                    {
                        NegativeTimes[NegativeTimes.Count - WordsList.Count + indexOfWord]++;
                    }
                }
                catch
                {
                    Debug.Log("Error Index \"SplitsTheParagraphInWords\" Method");
                }
            }
        }
    }
    return WordsList;
}
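The punctuation loop can also be written as a single regular expression; a sketch, equivalent under the assumption that only letters, digits, periods, and whitespace need to survive for the later sentence split:

using System.Text.RegularExpressions;

// Replace everything except letters, digits, periods, and whitespace with a space.
static string StripPunctuation(string text) =>
    Regex.Replace(text, @"[^\p{L}\p{Nd}.\s]", " ");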
public unsafe (WriteableIndex index, int files, int docs, long size) IndexAllParallel(IndexOptions options, string folder)
{
    var timer = Stopwatch.StartNew();
    var files = Directory.GetFiles(folder, "*", SearchOption.AllDirectories);
    Console.WriteLine("Found files: " + files.Length + " - took: " + timer.ElapsedMilliseconds + "ms");
    timer.Restart();
    var tasks = new List<Task<WriteableIndex>>();
    var parallel = Environment.ProcessorCount;
    var docsCount = 0;
    var fileCount = 0;
    var sizeSum = 0L;
    var sizeLocal = 0L;
    for (var p = 0; p < parallel; p++)
    {
        var taskNumber = p;
        tasks.Add(Task.Run(() =>
        {
            // Each task indexes a contiguous slice of the file list into its own
            // local index; the last task absorbs the remainder.
            var localIndex = new WriteableIndex(options);
            var localStemmer = new Stemmer();
            var localParser = new Parser();
            var localPart = files.Length / parallel;
            var from = taskNumber * localPart;
            var to = taskNumber == parallel - 1 ? files.Length : from + localPart;
            for (var i = from; i < to; i++)
            {
                using (var mmf = MemoryMappedFile.CreateFromFile(files[i], FileMode.Open))
                using (var accessor = mmf.CreateViewAccessor())
                {
                    byte* buffer = null;
                    accessor.SafeMemoryMappedViewHandle.AcquirePointer(ref buffer);
                    var len = new FileInfo(files[i]).Length;
                    var docs = localParser.ParseFileFast(buffer, len);
                    Interlocked.Add(ref docsCount, docs.Count);
                    Interlocked.Add(ref sizeSum, len);
                    Interlocked.Add(ref sizeLocal, len);
                    IndexDocuments(localIndex, localStemmer, buffer, docs);
                    accessor.SafeMemoryMappedViewHandle.ReleasePointer();
                }
                if (Interlocked.Increment(ref fileCount) % 100 == 0)
                {
                    Console.WriteLine(
                        "Finished: " + fileCount + " - "
                        + Math.Round(Interlocked.Read(ref sizeLocal) * 0.000001d) + " mb - "
                        + Math.Round((Interlocked.Read(ref sizeLocal) * 0.000001d) / (timer.ElapsedMilliseconds / 1000d), 2) + " mb/s");
                    timer.Restart();
                    Interlocked.Exchange(ref sizeLocal, 0);
                }
            }
            Console.WriteLine("task finished: " + taskNumber);
            return localIndex;
        }));
    }
    var all = Task.WhenAll(tasks).Result;
    Console.WriteLine("Index building completed. Merging indices ...");
    var master = all[0];
    var mergeTime = Stopwatch.StartNew();
    for (var i = 1; i < all.Length; i++)
    {
        master.Merge(all[i]);
    }
    mergeTime.Stop();
    Console.WriteLine("Merge complete after: " + mergeTime.ElapsedMilliseconds + " ms");
    timer.Stop();
    return (master, fileCount, docsCount, sizeSum);
}
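With P worker tasks and N files, task p takes the contiguous index range [p·⌊N/P⌋, (p+1)·⌊N/P⌋), and the last task runs through to N so the remainder is never dropped. A standalone sketch of that slice arithmetic:

// Sketch of the partitioning used above: P contiguous ranges over N items,
// the last range absorbing the remainder when N is not divisible by P.
static (int from, int to)[] Partition(int n, int p)
{
    var part = n / p;
    var slices = new (int from, int to)[p];
    for (int t = 0; t < p; t++)
    {
        int from = t * part;
        slices[t] = (from, t == p - 1 ? n : from + part);
    }
    return slices;
}
// Partition(10, 4) -> (0,2), (2,4), (4,6), (6,10)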
/// <summary>
/// The main method of the pre-query engine: processes all documents in the corpus and builds the index.
/// </summary>
public void engine()
{
    stopWatch.Start();
    string[] files = rf.getCorpusFilesFromSource();
    ConcurrentBag<string> languagesConcurrentBag = new ConcurrentBag<string>();
    foreach (string filePath in files)
    {
        if (filePath.EndsWith("stop_words.txt"))
        {
            continue;
        }
        string[] docs = rf.seperateDocumentsFromFile(filePath);
        ConcurrentDictionary<string, Dictionary<string, int>> ContinuTermsFileDic = new ConcurrentDictionary<string, Dictionary<string, int>>();
        ConcurrentDictionary<string, string> tempFileDictionary = new ConcurrentDictionary<string, string>();
        Parallel.ForEach(docs, new ParallelOptions { MaxDegreeOfParallelism = 1 }, (doc) =>
        {
            Stemmer stemmer = new Stemmer();
            Dictionary<string, int> uniqeTermsAtDoc = new Dictionary<string, int>();
            string metaData;
            string text;
            rf.getMetaDataAndTextFromDoc(doc, out metaData, out text);
            string docNo = indexer.AddDocFromMetaData(metaData);
            languagesConcurrentBag.Add(indexer.documentDictionary[docNo].originalLanguage);
            string[] stringSeparators = new string[] { " ", "\n", "...", "--", "?", ")", "(", "[", "]", "\"", "&", "_", ";", "~", "|" };
            string[] textArray = text.ToLower().Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < textArray.Length; i++)
            {
                textArray[i] = parser.cutAllsigns(textArray[i]);
            }
            if (textArray.Length == 0)
            {
                return;
            }
            // Pad the list so the parser can always look a few tokens ahead.
            List<string> textList = textArray.ToList();
            textList.Add("");
            textList.Add("");
            textList.Add("");
            textList.Add("");
            string lastParsTerm = "";
            for (int i = 0; i < textList.Count - 4; i++)
            {
                string parsedTerm1;
                string parsedTerm2;
                if (parser.checkForStopWord(textList[i]) == 1 && !textList[i].Equals("between"))
                {
                    continue;
                }
                int jump = parser.parseTerm(ref textList, i, out parsedTerm1, out parsedTerm2);
                if (jump >= 0)
                {
                    i += jump;
                    if (Properties.Settings.Default.stemmer)
                    {
                        parsedTerm1 = stemmer.stemTerm(parsedTerm1);
                    }
                    AddTermUniqe(parsedTerm1, uniqeTermsAtDoc);
                    if (i > 0)
                    {
                        AddAutoCompletion(lastParsTerm, parsedTerm1, ContinuTermsFileDic);
                    }
                    lastParsTerm = parsedTerm1;
                    if (parsedTerm2 != null)
                    {
                        if (Properties.Settings.Default.stemmer)
                        {
                            parsedTerm2 = stemmer.stemTerm(parsedTerm2);
                        }
                        AddTermUniqe(parsedTerm2, uniqeTermsAtDoc);
                    }
                }
                else if (parsedTerm1 != null && !textList[i].Equals("between") && !parsedTerm1.Equals(""))
                {
                    if (Properties.Settings.Default.stemmer)
                    {
                        parsedTerm1 = stemmer.stemTerm(parsedTerm1);
                    }
                    AddTermUniqe(parsedTerm1, uniqeTermsAtDoc);
                    if (i > 0)
                    {
                        AddAutoCompletion(lastParsTerm, parsedTerm1, ContinuTermsFileDic);
                    }
                    lastParsTerm = parsedTerm1;
                }
            }
            indexer.AddToMetaData(uniqeTermsAtDoc, docNo);
            CalWij(uniqeTermsAtDoc, docNo);
            indexer.addUniqueDicToTempDic(ref tempFileDictionary, uniqeTermsAtDoc, docNo);
            indexer.addUniqueDicToMainDic(uniqeTermsAtDoc);
        });
        indexer.addFileDicToDisk(tempFileDictionary);
        AddCompletionDicToMain(ContinuTermsFileDic);
    }
    indexer.stop = false;
    indexer.mergeQueue();
    indexer.updateTermPointers();
    indexer.saveTermDictionary();
    indexer.saveDocumentDictionary();
    stopWatch.Stop();
    LanguagesList = new List<string>(languagesConcurrentBag.Distinct());
    WriteLanguagesToDisk(languagesList);
    int sum = indexer.countNumbers();
    System.Windows.MessageBox.Show("Inverted index is complete. \nNumber of terms: " + indexer.mainTermDictionary.Count() + ".\nNumber of documents: " + indexer.documentDictionary.Count() + "\nRun time: " + stopWatch.ElapsedMilliseconds / 1000 + " seconds");
}
public Dictionary<string, List<CommentObj>> GetNamedObjects(int N)
{
    StringBuilder sbAllWords = new StringBuilder();
    foreach (children child in children)
    {
        sbAllWords.Append(child.SubtreeText);
        sbAllWords.Append(" ");
    }
    string[] allWords = GetAllWords(sbAllWords.ToString());
    Dictionary<string, string> stemParentDictionary = GetStemParentDictionary(allWords);
    List<string> namedObjects = new List<string>();

    // Build a synthetic root node whose ID branches cover each top-level subtree.
    children rootNode = new children();
    List<HashSet<int>> rootChildIDs = new List<HashSet<int>>();
    foreach (children child in children)
    {
        GetChildIDHashSetList(child);
        HashSet<int> currChildIDs = new HashSet<int>();
        currChildIDs.Add(child.id);
        foreach (var item in child.ChildIDList)
        {
            currChildIDs.UnionWith(item);
        }
        rootChildIDs.Add(currChildIDs);
    }
    rootNode.ChildIDList = rootChildIDs;
    NodeList = new List<children>();
    NodeList.Add(rootNode);
    foreach (children child in children)
    {
        PopulateNodeList(child);
    }

    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping();
    // For every stem, collect the nodes that act as lowest common ancestors
    // of the comments containing it.
    Dictionary<string, List<children>> WordLCAList = new Dictionary<string, List<children>>();
    foreach (var kvp in wordIDMapping)
    {
        List<children> currLCAList = new List<children>();
        foreach (children node in NodeList)
        {
            int numBranchesWithWord = 0;
            foreach (var childIDBranch in node.ChildIDList)
            {
                if (childIDBranch.Intersect(kvp.Value).Count() > 0)
                {
                    numBranchesWithWord += 1;
                }
            }
            if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
            {
                currLCAList.Add(node);
            }
        }
        WordLCAList[stemParentDictionary.ContainsKey(kvp.Key) ? stemParentDictionary[kvp.Key] : kvp.Key] = currLCAList;
    }

    // Keep rare, capitalized, multi-character words that are not contractions.
    namedObjects = WordLCAList
        .OrderByDescending(x => x.Value.Count)
        .Select(x => x.Key)
        .Where(y => CommonWords.GetFrequency(y) < 1)
        .Where(a => char.IsUpper(a[0]))
        .Where(b => b.Length > 1)
        .Where(z => !(z.EndsWith("n't") || z.EndsWith("'m") || z.EndsWith("'ll") || z.EndsWith("'d") || z.EndsWith("'ve") || z.EndsWith("'re") || z.EndsWith("'s")))
        .Take(N)
        .ToList();

    Dictionary<string, List<CommentObj>> namedObjectDictionary = new Dictionary<string, List<CommentObj>>();
    foreach (string namedObject in namedObjects)
    {
        List<CommentObj> commentObjsForWord = new List<CommentObj>();
        string stem = Stemmer.GetStem(namedObject);
        HashSet<int> idsWithWord = wordIDMapping[stem];
        foreach (int id in idsWithWord)
        {
            children child = GetNodeById(id);
            commentObjsForWord.Add(new CommentObj() { Id = id, Text = child.text });
        }
        namedObjectDictionary[namedObject] = commentObjsForWord;
    }
    var ordered = namedObjectDictionary.Keys
        .OrderByDescending(x => namedObjectDictionary[x].Count)
        .ToDictionary(x => x, x => namedObjectDictionary[x]);
    return ordered;
}
/*
 * This method sentence-tokenizes all top-level comments.
 * The best sentences are those whose words occur in the most
 * subtree items within the current top-level comment.
 */
public List<SentenceObj> GetTopSentences(int N)
{
    List<SentenceObj> topSentenceObjs = new List<SentenceObj>();
    List<string> topSentences = new List<string>();
    Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
    Dictionary<string, string> sentenceAuthors = new Dictionary<string, string>();
    Dictionary<string, string> sentenceCommentTrees = new Dictionary<string, string>();
    Dictionary<string, int> sentenceIds = new Dictionary<string, int>();
    foreach (children child in children)
    {
        try
        {
            Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
            string text = child.text;
            List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
            string bestSentence = currSentences[0];
            double currMax = double.MinValue;
            foreach (string sentence in currSentences)
            {
                string[] allWords = GetAllWords(sentence);
                bool goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                if (goodSentence)
                {
                    double weightedScore = 0;
                    int totalIDCount = 0;
                    foreach (string word in allWords)
                    {
                        if (!stopWords.Contains(word.ToLower()))
                        {
                            string stemmedWord = Stemmer.GetStem(word);
                            if (wordIDMapping.ContainsKey(stemmedWord))
                            {
                                HashSet<int> idsContainingWord = wordIDMapping[stemmedWord];
                                totalIDCount += idsContainingWord.Count;
                                weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                            }
                        }
                    }
                    // Add some weighting so that longer sentences have more weight.
                    weightedScore = weightedScore * (1 - (1 / Math.Pow(1.25, allWords.Length)));
                    double avgScore = weightedScore / allWords.Length;
                    if (avgScore > currMax)
                    {
                        currMax = avgScore;
                        bestSentence = sentence;
                    }
                }
            }
            sentenceScores[bestSentence] = currMax;
            sentenceAuthors[bestSentence] = child.author;
            sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
            sentenceIds[bestSentence] = child.id;
        }
        catch (Exception)
        {
            // Skip comments that fail to tokenize or score.
        }
    }
    topSentences = sentenceScores
        .OrderByDescending(x => x.Value)
        .Take(N)
        .Where(y => !string.IsNullOrWhiteSpace(y.Key))
        .Select(x => x.Key)
        .ToList();
    foreach (var sent in topSentences)
    {
        SentenceObj sentenceObj = new SentenceObj()
        {
            Author = sentenceAuthors[sent],
            Sentence = sent,
            SentenceCommentTree = sentenceCommentTrees[sent],
            Id = sentenceIds[sent],
            StoryId = this.id
        };
        topSentenceObjs.Add(sentenceObj);
    }
    topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
    return topSentenceObjs;
}
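Written out, the score assigned to a candidate sentence of $L$ words is

$$\text{score} = \frac{1 - 1.25^{-L}}{L} \sum_{w} \frac{|\text{ids}(w)|}{\text{freq}(w) + 1},$$

where the sum runs over non-stop words, ids(w) is the set of subtree comment IDs containing w's stem, and freq(w) is the word's common-usage frequency; the $1 - 1.25^{-L}$ factor damps very short sentences before the per-word average is taken.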
/// <summary>
/// Creates an analyzer that splits words into groups.
/// </summary>
/// <param name="listOfWords">The words to analyze</param>
/// <param name="stemmerToUse">The stemmer to use (depends on the language)</param>
/// <param name="detectAcuracy">The required matching accuracy between extracted stems</param>
public WordAnalizer(List<string> listOfWords, Stemmer.iStemmer stemmerToUse, double detectAcuracy)
{
    InputList = listOfWords;
    Stemmer = stemmerToUse;
    Accuracy = detectAcuracy;
}
private static void GenerateXmlFromFile(string fileName)
{
    Stemmer stemmer = new Stemmer();
    var htmlDoc = new HtmlDocument();
    htmlDoc.Load(fileName, Encoding.UTF8);
    var rootNode = htmlDoc.DocumentNode;
    var marksText = rootNode.SelectNodes("//span[@class='grade-label']");
    List<int> grades = new List<int>();
    // Get the list of marks (the literals below match the Russian review labels,
    // from "excellent model" down to "terrible model").
    if (marksText != null)
    {
        foreach (var mark in marksText)
        {
            switch (mark.InnerText)
            {
                case "отличная модель": grades.Add(5); break;
                case "хорошая модель": grades.Add(4); break;
                case "обычная модель": grades.Add(3); break;
                case "плохая модель": grades.Add(2); break;
                case "ужасная модель": grades.Add(1); break;
                default: break;
            }
        }
        // Get the texts for the marks.
        List<string> advantages = new List<string>();
        List<string> disadvantages = new List<string>();
        List<string> comments = new List<string>();
        var texts = rootNode.SelectNodes("//div[@class='data']");
        foreach (var text in texts)
        {
            if (text.ChildNodes[2].Name == "div")
            {
                // Advantages
                advantages.Add(text.ChildNodes[3].InnerText);
                // Disadvantages
                if (text.ChildNodes.Count == 5)
                {
                    disadvantages.Add(text.ChildNodes[4].InnerText);
                }
                // Comment
                if (text.ChildNodes.Count == 6)
                {
                    comments.Add(text.ChildNodes[5].InnerText);
                }
            }
            else
            {
                // Advantages
                advantages.Add(text.ChildNodes[2].InnerText);
                // Disadvantages
                if (text.ChildNodes.Count == 4)
                {
                    disadvantages.Add(text.ChildNodes[3].InnerText);
                }
                // Comment
                if (text.ChildNodes.Count == 5)
                {
                    comments.Add(text.ChildNodes[4].InnerText);
                }
            }
        }
        // Generate the XML: stem every word of each section and write one file per review.
        for (int i = 0; i < advantages.Count; i++)
        {
            var xml = new XmlDocument();
            var xmlNode = xml.CreateNode(XmlNodeType.XmlDeclaration, "", "");
            xml.AppendChild(xmlNode);
            var xmlElem = xml.CreateElement("", "review", "");
            xml.AppendChild(xmlElem);
            char[] delimiterChars = { ' ', ',', '.', ':', '\t' };
            if (advantages.Count > i)
            {
                string result_advantages = String.Empty;
                string[] advantages_split = advantages[i].Split(delimiterChars);
                foreach (var word in advantages_split)
                {
                    result_advantages = String.Concat(result_advantages, " ", stemmer.Stem(word));
                }
                var xmlAdvantages = xml.CreateElement("", "advantages", "");
                var xmlAdvatagesText = xml.CreateTextNode(result_advantages);
                xmlAdvantages.AppendChild(xmlAdvatagesText);
                xml.LastChild.AppendChild(xmlAdvantages);
            }
            if (disadvantages.Count > i)
            {
                string result_disadvantages = String.Empty;
                string[] disadvantages_split = disadvantages[i].Split(delimiterChars);
                foreach (var word in disadvantages_split)
                {
                    result_disadvantages = String.Concat(result_disadvantages, " ", stemmer.Stem(word));
                }
                var xmlDisadvantages = xml.CreateElement("", "disadvantages", "");
                // The words are already stemmed above, matching the other sections.
                var xmlDisadvantagesText = xml.CreateTextNode(result_disadvantages);
                xmlDisadvantages.AppendChild(xmlDisadvantagesText);
                xml.LastChild.AppendChild(xmlDisadvantages);
            }
            if (comments.Count > i)
            {
                string result_comments = String.Empty;
                string[] comments_split = comments[i].Split(delimiterChars);
                foreach (var word in comments_split)
                {
                    result_comments = String.Concat(result_comments, " ", stemmer.Stem(word));
                }
                var xmlComments = xml.CreateElement("", "comments", "");
                var xmlCommentsText = xml.CreateTextNode(result_comments);
                xmlComments.AppendChild(xmlCommentsText);
                xml.LastChild.AppendChild(xmlComments);
            }
            if (grades.Count > i)
            {
                var xmlGrade = xml.CreateElement("", "grade", "");
                var xmlGradeText = xml.CreateTextNode(grades[i].ToString());
                xmlGrade.AppendChild(xmlGradeText);
                xml.LastChild.AppendChild(xmlGrade);
            }
            // Generate the output path.
            string path = String.Concat("c:\\xml\\", "xml_", Path.GetFileName(fileName), i.ToString(), ".xml");
            xml.Save(path);
        }
    }
}
public string GetSuggestion(string sPNameForGetCandidates, Dictionary<int, string> deHyphenateTokens, int key, string value, out string log, bool withStemAndAffixCorr, bool withSearch)
{
    log = "";
    string error = value;
    string suggestion;
    int minSameBigramAmount = getMinSameBigramAmount(error.Length);
    int minLengthVariant = 1;   // limit addition/deletion errors to 1 character
    int maxLevensthein = 2;
    int minCandidates = 10;
    int maxCandidates = 10;
    string candidatesLog = "";
    List<CorrectionCandidate> candidates = GetCandidates(sPNameForGetCandidates, key, error, minSameBigramAmount, minLengthVariant, maxLevensthein, out candidatesLog);
    if (candidates.Count < minCandidates)
    {
        // Not enough candidates: relax minSameBigramAmount.
        if (minSameBigramAmount - 1 > 0)
        {
            candidates = GetCandidates(sPNameForGetCandidates, key, error, minSameBigramAmount - 1, minLengthVariant, maxLevensthein, out candidatesLog);
            if (candidates.Count < minCandidates && withStemAndAffixCorr)
            {
                // Still not enough: derive candidates from the stem.
                Stemmer stemmer = new Stemmer();
                string prefix;
                string suffix;
                string errorRoot = stemmer.StemmingWithoutChecking(error, out prefix, out suffix);
                if (error != errorRoot && errorRoot.Length >= 3)
                {
                    minSameBigramAmount = getMinSameBigramAmount(errorRoot.Length);
                    if (minSameBigramAmount > 0)
                    {
                        var temp1 = GetCandidates(sPNameForGetCandidates, key, error, errorRoot, prefix, suffix, minSameBigramAmount, minLengthVariant, maxLevensthein, out candidatesLog);
                        candidates.AddRange(temp1.Where(x => candidates.FirstOrDefault(y => y.Candidate == x.Candidate) == null).ToList());
                    }
                }
                // Still not enough: try correcting the affixes.
                string candidatesFromAffixCorrLog;
                var temp2 = GetCandidatesFromAffixCorrection(key, error, out candidatesFromAffixCorrLog);
                candidates.AddRange(temp2.Where(x => candidates.FirstOrDefault(y => y.Candidate == x.Candidate) == null).ToList());
                candidatesLog += candidatesFromAffixCorrLog;
            }
        }
    }
    if (candidates.Count == 0)
    {
        log = "No candidates";
        return value;   // assume it is a correct word that the dictionary does not cover
    }
    else if (candidates.Count == 1)
    {
        log += candidatesLog;
        return candidates[0].Candidate;
    }
    else
    {
        // If a candidate matches the error exactly, use it as the suggestion.
        var candidatesEqError = candidates.Where(o => o.Candidate.Equals(error, StringComparison.OrdinalIgnoreCase)).ToList();
        if (candidatesEqError.Count > 0)
        {
            log = "candidate=error";
            return candidatesEqError[0].Candidate;
        }
        if (candidates.Count > maxCandidates)
        {
            // Cap the candidate count by smallest Levenshtein distance, then by frequency.
            var temp = candidates.OrderBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).ToList();
            candidates = new List<CorrectionCandidate>();
            for (int i = 0; i < maxCandidates; i++)
            {
                candidates.Add(temp[i]);
            }
        }
        string logsearch;
        List<CorrectionCandidate> SortedCandidates = new List<CorrectionCandidate>();
        if (withSearch)
        {
            // For each candidate, search the surrounding text using trigrams.
            foreach (CorrectionCandidate candidate in candidates)
            {
                candidate.NGram = 3;
                candidate.Hits = GetTrigramSearchHits(deHyphenateTokens, candidate, false, out logsearch);
                log += string.Format("[{0}:{1},{2}]", candidate.Candidate, logsearch, candidate.Hits);
            }
            // If no candidate has hits, fall back to a bigram search.
            if (candidates.Where(p => p.Hits > 0).Count() == 0)
            {
                foreach (CorrectionCandidate candidate in candidates)
                {
                    candidate.NGram = 2;
                    candidate.Hits = GetBigramSearchHits(deHyphenateTokens, candidate, false, out logsearch);
                    log += string.Format("[{0}:{1},{2}]", candidate.Candidate, logsearch, candidate.Hits);
                }
            }
            // Sort by largest hits, then smallest Levenshtein distance, then frequency.
            SortedCandidates = candidates.OrderByDescending(o => o.Hits).ThenBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).ToList();
        }
        else
        {
            SortedCandidates = candidates.OrderBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).ToList();
        }
        // Final suggestion:
        suggestion = SortedCandidates[0].Candidate;
        log = candidatesLog + log;
        return suggestion;
    }
}
private void btCorrectIt_Click(object sender, EventArgs e)
{
    if (cbMethod.SelectedItem.ToString() == "-- Choose method --")
    {
        MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
        return;
    }
    if (!File.Exists(txOCR.Text.Trim()))
    {
        MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);
        return;
    }
    ResetControls(false);
    articleFile = txOCR.Text.Trim();
    Correction correction = new Correction();
    Stemmer stemmer = new Stemmer();

    // De-hyphenate and clean the text:
    string dehyphenatedText = correction.DeHyphenate(articleFile);
    rtbOCR.Text = dehyphenatedText;

    // For analysis: load the ground-truth file if present.
    string dehyphenatedTextGT = "";
    if (File.Exists(articleFile.Substring(0, articleFile.Length - 4) + "GT.txt"))
    {
        articleFileGT = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
    }
    articleFileName = Path.GetFileName(articleFile);
    if (!string.IsNullOrEmpty(articleFileGT))
    {
        dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);
    }

    // Tokenize:
    deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);
    Regex rgx = new Regex("[^a-zA-Z]");   // strips all non-alphabetic characters from a word

    // For analysis: pair each OCR token with its ground-truth token.
    Dictionary<int, string> deHyphenateTokensGT = new Dictionary<int, string>();
    if (!string.IsNullOrEmpty(articleFileGT))
    {
        deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
        foreach (KeyValuePair<int, string> token in deHyphenateTokens)
        {
            correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(deHyphenateTokensGT[token.Key], ""));
        }
    }

    // Omit single-character and all-capital tokens, and strip non-alphabetic characters:
    var tmp = deHyphenateTokens.Where(p => p.Value.Length > 1).ToDictionary(p => p.Key, p => p.Value);
    tmp = tmp.Where(p => p.Value.Any(Char.IsLetter)).ToDictionary(p => p.Key, p => rgx.Replace(p.Value, ""));
    Dictionary<int, string> cleanTokens = tmp.Where(p => !p.Value.All(Char.IsUpper)).ToDictionary(p => p.Key, p => p.Value);

    // Find suggestions using Hunspell:
    if (cbMethod.SelectedItem.ToString().EndsWith("Hunspell"))
    {
        string hunspellLog = "";
        foreach (KeyValuePair<int, string> err in cleanTokens)
        {
            string errInNewSpell = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();
            List<string> hunspellSuggestions = new List<string>();
            using (SpellEngine engine = new SpellEngine())
            {
                LanguageConfig idConfig = new LanguageConfig();
                idConfig.LanguageCode = "id";
                idConfig.HunspellAffFile = "id_ID.aff";
                idConfig.HunspellDictFile = "id_ID.dic";
                idConfig.HunspellKey = "";
                engine.AddLanguage(idConfig);
                bool correct = engine["id"].Spell(errInNewSpell);
                if (!correct)
                {
                    hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
                    if (hunspellSuggestions.Count > 0 && err.Value != correction.ChangeNewToOldSpell(hunspellSuggestions[0]))
                    {
                        deHyphenateTokens[err.Key] = "[" + correction.ChangeNewToOldSpell(hunspellSuggestions[0]) + "]";
                    }
                    // For analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, err.Key, new Dictionary<string, string> { { getFieldNameFromOption(), rgx.Replace(deHyphenateTokens[err.Key], "") }, { getFieldNameFromOption().Replace("Correction", "Log"), hunspellLog } });
                    }
                }
                else
                {
                    // For analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, err.Key, new Dictionary<string, string> { { getFieldNameFromOption(), err.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), err.Value + " is correct" } });
                    }
                }
            }
        }
        ResetControls(true);
        return;
    }

    // Check only unique words (assumption: a duplicated word is a correct word):
    Dictionary<int, string> checkTokens = cleanTokens;
    var duplicateValues = checkTokens.GroupBy(x => x.Value).Where(x => x.Count() > 1);
    List<int> duplicateKeys = new List<int>();
    foreach (var item in checkTokens)
    {
        foreach (var dup in duplicateValues)
        {
            if (item.Value == dup.Key)
            {
                duplicateKeys.Add(item.Key);
            }
        }
    }
    foreach (var dupkey in duplicateKeys)
    {
        // For analysis:
        if (!string.IsNullOrEmpty(articleFileGT))
        {
            correction.UpdateFields(articleFileName, dupkey, new Dictionary<string, string> { { "NCorrection", checkTokens[dupkey] }, { "NLog", "Duplicate" }, { "Correction", checkTokens[dupkey] }, { "Log", "Duplicate" }, { "WOSearchCorrection", checkTokens[dupkey] }, { "WOSearchLog", "Duplicate" }, { "WOStemCorrection", checkTokens[dupkey] }, { "WOStemLog", "Duplicate" }, { "WOStemSearchCorrection", checkTokens[dupkey] }, { "WOStemSearchLog", "Duplicate" }, { "GooglePureCorrection", checkTokens[dupkey] }, { "GooglePureLog", "Duplicate" } });
        }
        checkTokens.Remove(dupkey);
    }

    // Check each word against the dictionary (KBBI plus selected Kompas entries,
    // city/country entities, and hero names from Wikipedia):
    errors = new Dictionary<int, string>();
    foreach (KeyValuePair<int, string> token in checkTokens)
    {
        // Change old (Soewandi) spelling to modern spelling:
        string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();
        // Check the word in the dictionary and add it to the error list if absent:
        int frequency;
        if (!correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
        {
            if (cbMethod.SelectedItem.ToString().Contains("Stemmer"))
            {
                // Check whether its stem is in the dictionary:
                string stem = stemmer.Stemming(wordToCheck);
                if (wordToCheck != stem && stemmer.checkStem(stem))
                {
                    // For analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, token.Key, new Dictionary<string, string> { { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), stem + " is word" } });
                    }
                }
                else   // not in the dictionary:
                {
                    errors.Add(token.Key, wordToCheck);
                }
            }
            else
            {
                errors.Add(token.Key, wordToCheck);
            }
        }
        else   // in the dictionary:
        {
            // For analysis:
            if (!string.IsNullOrEmpty(articleFileGT))
            {
                correction.UpdateFields(articleFileName, token.Key, new Dictionary<string, string> { { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), wordToCheck + " is correct" } });
            }
        }
    }

    // Find suggestions:
    if (cbMethod.SelectedItem.ToString().EndsWith("Google"))
    {
        timerGoogle.Enabled = true;
        indexTimerGoogle = 0;
        return;
    }
    else
    {
        foreach (KeyValuePair<int, string> err in errors)
        {
            // Get a suggestion:
            string log;
            string suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());
            // Change the suggestion back to the old spelling if there were candidates:
            if (log != "No candidates")
            {
                suggestion = correction.ChangeNewToOldSpell(suggestion);
            }
            // Update the token dictionary with the suggestion:
            if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
            {
                deHyphenateTokens[err.Key] = "[" + suggestion + "]";
            }
            // For analysis:
            if (!string.IsNullOrEmpty(articleFileGT))
            {
                correction.UpdateFields(articleFileName, err.Key, new Dictionary<string, string> { { getFieldNameFromOption(), suggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), log } });
            }
        }
        ResetControls(true);
    }
}
public static string ExtractStemFeatureFromSingleTokenAndUpdateItemFeatures(Stemmer stemmer, Dictionary<string, double> item, string tokenKey)
{
    tokenKey = stemmer.Stem(tokenKey);
    item.IncreaseFeatureFrequency("stem_" + tokenKey, 1);
    return tokenKey;
}
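The snippet relies on an IncreaseFeatureFrequency extension defined elsewhere; a hypothetical stand-in with the semantics the call implies (add the given amount to the feature's current count, starting from zero when absent):

using System.Collections.Generic;

static class FeatureExtensions
{
    // Hypothetical implementation of the extension assumed above.
    public static void IncreaseFeatureFrequency(
        this Dictionary<string, double> features, string key, double amount)
    {
        features.TryGetValue(key, out double current);
        features[key] = current + amount;
    }
}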
private unsafe void IndexDocuments(WriteableIndex index, Stemmer stemmer, byte* buffer, List<(string id, int from, int length)> docs)