Exemplo n.º 1
0
        /// <summary>
        /// Verifies that <c>Stemmer.Stem</c> maps several inflected Persian words to their expected stems.
        /// The first mismatch fails the test with a message naming the offending input.
        /// </summary>
        public void StemTest()
        {
            // Each row is { inflected input, expected stem }.
            string[,] cases =
            {
                { "کتابی", "کتاب" },
                { "کتاب‌ها", "کتاب" },
                { "کتاب‌هایی", "کتاب" },
                { "کتابهایشان", "کتاب" },
                { "اندیشه‌اش", "اندیشه" }
            };

            Stemmer stemmer = new Stemmer();

            for (int row = 0; row < cases.GetLength(0); row++)
            {
                string input    = cases[row, 0];
                string expected = cases[row, 1];
                string actual   = stemmer.Stem(input);

                Assert.AreEqual(expected, actual, "Failed to stem of '" + input + "'");
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Counts stemmed words over the concatenated <c>SubtreeText</c> of all children and
        /// returns the N most frequent ones.
        /// </summary>
        /// <param name="N">Maximum number of entries to return.</param>
        /// <returns>
        /// A dictionary keyed by the longest original word seen for each stem, mapped to the
        /// number of occurrences of that stem. Stop words and the ignore list ("*") are excluded.
        /// </returns>
        public Dictionary <string, int> GetTopNWordsDictionary(int N)
        {
            string[] ignoreWords = { "*" };

            StringBuilder sbFullText = new StringBuilder();

            foreach (children child in this.children)
            {
                sbFullText.Append(child.SubtreeText);
                sbFullText.Append(" ");
            }
            string[] allWords = GetAllWords(sbFullText.ToString());

            Dictionary <string, int>    wordCount  = new Dictionary <string, int>();
            Dictionary <string, string> stemParent = new Dictionary <string, string>();

            foreach (string word in allWords)
            {
                try
                {
                    string stemmed = Stemmer.GetStem(word);

                    // Remember the longest surface form seen for this stem; it becomes the output key.
                    string knownParent;
                    if (!stemParent.TryGetValue(stemmed, out knownParent) || knownParent.Length < word.Length)
                    {
                        stemParent[stemmed] = word;
                    }

                    if (stopWords.Contains(stemmed.ToLower()))
                    {
                        continue;
                    }

                    // Skip ignored words explicitly. Previously they fell into the "+= 1" branch,
                    // which threw KeyNotFoundException and relied on the catch below to drop them.
                    if (ignoreWords.Contains(stemmed))
                    {
                        continue;
                    }

                    int seen;
                    wordCount.TryGetValue(stemmed, out seen); // seen stays 0 when the stem is new
                    wordCount[stemmed] = seen + 1;
                }
                catch (Exception ex)
                {
                    // Best effort: a word that cannot be stemmed is reported and skipped.
                    Console.WriteLine(ex.ToString());
                }
            }

            return(wordCount.OrderByDescending(x => x.Value)
                            .Take(N)
                            .ToDictionary(kvp => stemParent[kvp.Key], kvp => kvp.Value));
        }
Exemplo n.º 3
0
        /// <summary>
        /// Returns the lemmatized words of a query. For each word the first result of
        /// <c>Analyser.FindAllSourceForm</c> is taken; when its <c>SourceForm</c> is null the word
        /// is stemmed instead (i.e. its ending is cut off).
        /// </summary>
        /// <param name="request">The words of the query.</param>
        /// <returns>The processed query words, in their original order.</returns>
        public static List <string> GetStremmingWords(string[] request)
        {
            var processed = new List <string>();

            for (int i = 0; i < request.Length; i++)
            {
                string current   = request[i];
                var    firstForm = Analyser.FindAllSourceForm(current).FirstOrDefault();

                // Fall back to plain stemming when no source form is available.
                string normalized = firstForm.SourceForm ?? Stemmer.Stemm(current);
                processed.Add(normalized);
            }

            return(processed);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Appends runs to <c>Inlines</c> for <c>InputText</c>, rendering in bold every input word
        /// whose stem is one of the stemmed words of <c>SearchText</c>. Does nothing when either
        /// text yields no words.
        /// </summary>
        private void GenerateInlines()
        {
            string[] stemmedSearchTerms = SearchIndex.StemWords(SearchIndex.GetWords(this.SearchText));
            if (stemmedSearchTerms.Length == 0)
            {
                return;
            }

            string[] candidates = SearchIndex.GetWords(this.InputText);
            if (candidates.Length == 0)
            {
                return;
            }

            // Keep the input words whose stem matches a search term.
            Stemmer  stemmer     = new Stemmer();
            string[] toHighlight = candidates
                                   .Where(candidate => Enumerable.Contains <string>(stemmedSearchTerms, stemmer.Stem(candidate)))
                                   .ToArray();

            string source      = this.InputText;
            Regex  highlighter = GetRegexFromWordList(toHighlight);
            int    cursor      = 0; // first character of source not yet emitted

            if (highlighter != null)
            {
                foreach (Match hit in highlighter.Matches(source))
                {
                    // Plain text between the previous hit and this one.
                    if (hit.Index > cursor)
                    {
                        this.Inlines.Add(new Run(source.Substring(cursor, hit.Index - cursor)));
                    }

                    this.Inlines.Add(new Bold(new Run(hit.Value)));

                    cursor = hit.Index + hit.Length;
                }
            }

            // Trailing plain text after the last hit (or the whole text when nothing matched).
            if (cursor < source.Length)
            {
                this.Inlines.Add(new Run(source.Substring(cursor)));
            }

            Assert.IsTrue(this.Inlines.Count != 0);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Lemmatizes the words of a user's direct query for vector search. For each word the first
        /// result of <c>Analyser.FindAllSourceForm</c> is taken; when its <c>SourceForm</c> is null
        /// the word is stemmed instead.
        /// </summary>
        /// <param name="request">The words of the query.</param>
        /// <returns>An array of the same length holding the processed words in order.</returns>
        public static string[] GetStremmingWordsForQuery(string[] request)
        {
            var lemmas = new string[request.Length];
            int slot   = 0;

            foreach (string term in request)
            {
                var firstForm = Analyser.FindAllSourceForm(term).FirstOrDefault();

                lemmas[slot] = firstForm.SourceForm ?? Stemmer.Stemm(term);
                slot++;
            }

            return(lemmas);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Runs <paramref name="text"/> through the token stream produced by <c>Stemmer</c> and
        /// joins the resulting terms with single spaces.
        /// </summary>
        /// <param name="text">The text to analyze.</param>
        /// <returns>The terms separated by spaces, with surrounding whitespace trimmed.</returns>
        public string StemText(string text)
        {
            // A StringBuilder avoids re-copying the whole result for every token.
            System.Text.StringBuilder joined = new System.Text.StringBuilder();
            TokenStream stream = Stemmer.TokenStream(String.Empty, new StringReader(text));

            // NOTE(review): the stream is never closed here — confirm whether this analyzer
            // version expects Close()/Dispose() on the returned TokenStream.
            while (stream.IncrementToken())
            {
                TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
                joined.Append(termAttr.Term()).Append(" ");
            }

            return(joined.ToString().Trim());
        }
Exemplo n.º 7
0
        /// <summary>
        /// Classifies a text as 1 or 0: every word is stemmed, the values listed for those stems in
        /// "correlations.txt" are summed, and the sum is passed through a logistic function.
        /// </summary>
        /// <param name="text">Text to classify; it is split on spaces and commas.</param>
        /// <returns>1 when the logistic value exceeds 0.5, otherwise 0.</returns>
        public int GetTextForMark(string text)
        {
            Stemmer stemmer = new Stemmer();
            var     words   = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < words.Length; i++)
            {
                words[i] = stemmer.Stem(words[i]);
            }

            Dictionary <string, double> correlationTable = new Dictionary <string, double>();

            // Dispose the reader even if parsing throws (e.g. Add on a duplicate key).
            using (StreamReader reader = new StreamReader("correlations.txt"))
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    // Only "<word> <number>" lines with a non-empty word are accepted.
                    string[] splittedLine = line.Split(' ');
                    double   correlation;

                    if (splittedLine.Length == 2 &&
                        Double.TryParse(splittedLine[1], out correlation) &&
                        splittedLine[0] != "")
                    {
                        correlationTable.Add(splittedLine[0], correlation);
                    }
                }
            }

            double logOfP = 0;

            foreach (var word in words)
            {
                double weight;
                if (correlationTable.TryGetValue(word, out weight))
                {
                    logOfP += weight;
                }
            }

            // Same logistic value as exp(x) / (exp(x) + 1), written so that a large positive x
            // no longer overflows to Infinity / Infinity = NaN (which was reported as 0).
            double result = 1.0 / (1.0 + Math.Exp(-logOfP));

            if (result > 0.5)
            {
                return(1);
            }
            return(0);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Builds a stop-word lookup table from a file containing one word per line. Each line is
        /// trimmed, stemmed and lower-cased before being stored as a key (the value is always 1).
        /// </summary>
        /// <param name="path">Path of the stop-word file.</param>
        /// <returns>A table whose keys are the stemmed, lower-cased stop words.</returns>
        private static Hashtable genStopwordTable(string path)
        {
            Hashtable stopwordTable = new Hashtable();
            Stemmer   stemmer       = new Stemmer();

            // "using" closes the file even when reading or stemming throws.
            using (StreamReader stopFile = new StreamReader(path))
            {
                string line;

                while ((line = stopFile.ReadLine()) != null)
                {
                    // The length must describe the trimmed buffer. Passing line.Length made the
                    // stemmer read past the array on lines with surrounding whitespace.
                    string trimmed = line.Trim();
                    stemmer.add(trimmed.ToCharArray(), trimmed.Length);
                    stemmer.stem();

                    string word = stemmer.ToString();
                    stopwordTable[word.ToLower()] = 1;
                }
            }

            return(stopwordTable);
        }
        /// <summary>
        /// Classifies a text as 1 or 0: every word is stemmed, the values listed for those stems in
        /// "correlations.txt" are summed, and the sum is passed through a logistic function.
        /// </summary>
        /// <param name="text">Text to classify; it is split on spaces and commas.</param>
        /// <returns>1 when the logistic value exceeds 0.5, otherwise 0.</returns>
        public int GetTextForMark(string text)
        {
            Stemmer stemmer = new Stemmer();
            var words = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < words.Length; i++)
            {
                words[i] = stemmer.Stem(words[i]);
            }

            Dictionary<string, double> correlationTable = new Dictionary<string, double>();

            // Dispose the reader even if parsing throws (e.g. Add on a duplicate key).
            using (StreamReader reader = new StreamReader("correlations.txt"))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Only "<word> <number>" lines with a non-empty word are accepted.
                    string[] splittedLine = line.Split(' ');
                    double correlation;
                    if (splittedLine.Length == 2 &&
                        Double.TryParse(splittedLine[1], out correlation) &&
                        splittedLine[0] != "")
                    {
                        correlationTable.Add(splittedLine[0], correlation);
                    }
                }
            }

            double logOfP = 0;
            foreach (var word in words)
            {
                double weight;
                if (correlationTable.TryGetValue(word, out weight))
                {
                    logOfP += weight;
                }
            }

            // Same logistic value as exp(x) / (exp(x) + 1), written so that a large positive x
            // no longer overflows to Infinity / Infinity = NaN (which was reported as 0).
            double result = 1.0 / (1.0 + Math.Exp(-logOfP));
            if (result > 0.5)
            {
                return 1;
            }
            return 0;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Stems every space-separated word of every entry in the given list.
        /// </summary>
        /// <param name="noStopwordsBugReportList">Entries whose words are to be stemmed.</param>
        /// <returns>
        /// One string per input entry, made of the stemmed words each followed by a single space
        /// (so every result ends with a trailing space).
        /// </returns>
        private List <string> ApplyStemming(List <string> noStopwordsBugReportList)
        {
            List <string> stemmedList = new List <string>();

            foreach (var item in noStopwordsBugReportList)
            {
                // A StringBuilder avoids re-copying the growing line for every word.
                System.Text.StringBuilder stemmedLine = new System.Text.StringBuilder();

                foreach (string word in item.Split(' '))
                {
                    Stemmer wordStemmer = new Stemmer();
                    wordStemmer.add(word.ToCharArray(), word.Length);
                    wordStemmer.stem();

                    stemmedLine.Append(wordStemmer.ToString()).Append(" ");
                }
                stemmedList.Add(stemmedLine.ToString());
            }
            return(stemmedList);
        }
Exemplo n.º 11
0
        /// <summary>
        /// Processes the specified text: normalizes and tokenizes it, normalizes, stems and
        /// stop-word-marks the tokens, tags the detected sentences, runs the entity finder and
        /// detects sentences again on its output.
        /// </summary>
        /// <param name="text">The text.</param>
        /// <returns>The resulting document object.</returns>
        public Document Process(string text)
        {
            var normalizedText = NormalizerManager.Normalize(text);
            var tokenSet       = Tokenizer.Tokenize(normalizedText, TokenizerLanguage);

            // Token-level pipeline; each stage consumes the previous stage's output.
            tokenSet = NormalizerManager.Normalize(tokenSet);
            tokenSet = Stemmer.Stem(tokenSet, StemmerLanguage);
            tokenSet = StopWordsManager.MarkStopWords(tokenSet, StopWordsLanguage);

            var detected = SentenceDetector.Detect(tokenSet, SentenceDetectorLanguage);

            int position = 0;
            while (position < detected.Length)
            {
                var current = detected[position];
                current.Tokens = POSTagger.Tag(current.Tokens, POSTaggerLanguage);
                ++position;
            }

            // Sentences are detected a second time, on the entity finder's token output.
            tokenSet = EntityFinder.Find(tokenSet, EntityFinderType);
            detected = SentenceDetector.Detect(tokenSet, SentenceDetectorLanguage);

            return(new Document(detected, tokenSet, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage));
        }
Exemplo n.º 12
0
        /// <summary>
        /// Recursively maps each word stem to the ids of the nodes, in the subtree rooted at
        /// <paramref name="child"/>, whose text contains a word with that stem.
        /// Stop words are skipped, as are words shorter than three characters that contain a
        /// lower-case letter.
        /// </summary>
        /// <param name="child">Root of the subtree to index.</param>
        /// <returns>Stem to node-id-set mapping for the whole subtree.</returns>
        Dictionary <string, HashSet <int> > GetWordIDMapping(children child)
        {
            var idsByStem = new Dictionary <string, HashSet <int> >();

            foreach (string token in GetAllWords(child.text))
            {
                bool isStopWord = stopWords.Contains(token.ToLower());
                if (isStopWord)
                {
                    continue;
                }

                bool isShortLowercase = token.Length < 3 && token.Any(c => char.IsLower(c));
                if (isShortLowercase)
                {
                    continue;
                }

                string        stem = Stemmer.GetStem(token);
                HashSet <int> ids;
                if (!idsByStem.TryGetValue(stem, out ids))
                {
                    ids             = new HashSet <int>();
                    idsByStem[stem] = ids;
                }
                ids.Add(child.id);
            }

            // Fold every descendant's mapping into this node's mapping.
            foreach (children descendant in child.Children)
            {
                foreach (var entry in GetWordIDMapping(descendant))
                {
                    HashSet <int> existing;
                    if (idsByStem.TryGetValue(entry.Key, out existing))
                    {
                        existing.UnionWith(entry.Value);
                    }
                    else
                    {
                        idsByStem[entry.Key] = entry.Value;
                    }
                }
            }

            return(idsByStem);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Computes the base-2 log of the joint value of two terms divided by the product of
        /// their individual values, all looked up in <paramref name="prob"/>. Both terms are
        /// lower-cased and stemmed first; the joint value is the sum of the entries found under
        /// "ti tj" and "tj ti".
        /// </summary>
        /// <param name="ti">First term.</param>
        /// <param name="tj">Second term.</param>
        /// <param name="prob">Lookup of values keyed by stem or by "stem stem" pair.</param>
        /// <returns>
        /// The log ratio, or 0 when neither pair key exists or either single-term key is missing.
        /// </returns>
        static double pmi(string ti, string tj, Dictionary <string, double> prob)
        {
            ti = ti.ToLower();
            Stemmer firstStemmer = new Stemmer();
            firstStemmer.add(ti.ToCharArray(), ti.Length);
            firstStemmer.stem();
            ti = firstStemmer.ToString();

            tj = tj.ToLower();
            Stemmer secondStemmer = new Stemmer();
            secondStemmer.add(tj.ToCharArray(), tj.Length);
            secondStemmer.stem();
            tj = secondStemmer.ToString();

            double forward;
            double backward;
            bool   hasForward  = prob.TryGetValue(ti + " " + tj, out forward);
            bool   hasBackward = prob.TryGetValue(tj + " " + ti, out backward);

            if (!(hasForward || hasBackward))
            {
                return(0);
            }

            double joint = 0;
            if (hasForward)
            {
                joint += forward;
            }
            if (hasBackward)
            {
                joint += backward;
            }

            double first;
            double second;
            if (!prob.TryGetValue(ti, out first) || !prob.TryGetValue(tj, out second))
            {
                return(0);
            }

            return(Math.Log(joint / (first * second), 2));
        }
Exemplo n.º 14
0
        /// <summary>
        /// Creates the spell checker and loads its dictionaries: the word-frequency list (used for
        /// the Soundex index, the Norvig checker, the stemmer and the global dictionary), the
        /// stop-word list and the user dictionary.
        /// Loading is best effort: a failure is traced and the instance keeps whatever was
        /// loaded up to that point.
        /// </summary>
        public CheakSpell()
        {
            // Created before the try block so these collections are never left null.
            _globalDic      = new List <string>();
            _userDic        = new List <string>();
            _ignoreList     = new List <string>();
            _stopWordList   = new List <string>();
            _ignoreCharList = new List <char>();

            try
            {
                var persianWordFrequencyOpration = new PS_PersianWordFrequencyOpration(); //load from DB

                var listParsianWordfreq = persianWordFrequencyOpration.GetAll();
                _sundex = new Soundex(listParsianWordfreq.Where(x => x.Sundex.Length > 0).ToList());
                _norvan = new NorvigSpellChecker(listParsianWordfreq);
                _stemmr = new Stemmer(listParsianWordfreq);
                foreach (var item in listParsianWordfreq)
                {
                    _globalDic.Add(item.Val1.Trim());
                }

                var lsStop = new PS_StopWordOpration();

                foreach (var item in lsStop.GetAll())
                {
                    _stopWordList.Add(item.Val1.Trim());
                }

                var userDicOpration = new UserDicOpration();
                _userDic = userDicOpration.LoadAll();
            }
            catch (Exception ex)
            {
                // Still non-fatal, but no longer silent: record why the dictionaries are incomplete.
                System.Diagnostics.Trace.TraceError("CheakSpell initialization failed: " + ex);
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Filters input text: replaces periods with spaces, optionally removes stop words,
        /// optionally stems every space-separated word, and finally strips special characters.
        /// </summary>
        /// <param name="text">The text to filter.</param>
        /// <returns>The filtered text.</returns>
        private string FilterText(string text)
        {
            var currDir = System.Environment.GetFolderPath(System.Environment.SpecialFolder.ApplicationData);

            // Combine the base folder with the application-specific folder.
            string specificFolder = System.IO.Path.Combine(currDir, "MARC 3.0");

            // Check if the folder exists and if not, create it.
            if (!Directory.Exists(specificFolder))
            {
                Directory.CreateDirectory(specificFolder);
            }

            // Strings are immutable: Replace returns a new string. The result used to be
            // discarded, so periods were never actually replaced.
            text = text.Replace('.', ' ');

            if (NoSWCheckboxCheckedState)
            {
                StopWordRemoval.StopWordRemoval temp = new StopWordRemoval.StopWordRemoval(text, specificFolder);
                text = temp.output;
            }

            if (STCheckboxCheckedState)
            {
                // A StringBuilder avoids re-copying the growing text for every word.
                System.Text.StringBuilder stemmedText = new System.Text.StringBuilder();

                foreach (string word in text.Split(' '))
                {
                    Stemmer wordStemmer = new Stemmer();
                    wordStemmer.add(word.ToCharArray(), word.Length);
                    wordStemmer.stem();

                    // Every stem is followed by a space, including the last one.
                    stemmedText.Append(wordStemmer.ToString()).Append(" ");
                }
                text = stemmedText.ToString();
            }

            text = RemoveSpecialCharacters(text);
            return(text);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Normalizes a raw token: lower-cases it, trims leading and trailing '_', '-' and '.',
        /// and stems it.
        /// </summary>
        /// <param name="word">The raw token.</param>
        /// <param name="stopwordTable">Table whose keys are the stems to reject.</param>
        /// <returns>
        /// The stem, or null when the raw token parses as a number, the stem is empty, or the
        /// stem is a key of <paramref name="stopwordTable"/>.
        /// </returns>
        private static string getWord(string word, Hashtable stopwordTable)
        {
            Stemmer stemmer = new Stemmer();
            string  cleaned = word.ToLower().Trim('_', '-', '.');

            // The numeric check is made on the raw token, not on the cleaned one.
            double parsed;
            if (double.TryParse(word, out parsed))
            {
                return(null);
            }

            stemmer.add(cleaned.ToCharArray(), cleaned.Length);
            stemmer.stem();
            string stem = stemmer.ToString();

            bool rejected = stem.Length == 0 || stopwordTable.ContainsKey(stem);
            return(rejected ? null : stem);
        }
Exemplo n.º 17
0
        /// <summary>
        /// Retrieves the results of a single query: parses it, derives its semantic expansion,
        /// optionally stems the query terms, ranks the documents and stores the sorted ranking in
        /// <c>QueriesResults</c> under <paramref name="queryId"/>.
        /// </summary>
        /// <param name="query">The raw query text.</param>
        /// <param name="language">Language passed to the occurrence lookups.</param>
        /// <param name="queryId">Key under which the result is stored.</param>
        public void retriveSingleQuery(string query, string language, int queryId)
        {
            Stemmer stemmer = new Stemmer();

            string[]      parsedTerms   = searcher.ParseQuery(query);
            List <string> semanticTerms = searcher.AddSemantic(parsedTerms.ToList());
            List <string> queryTerms    = parsedTerms.ToList();

            // Only the direct query terms are stemmed, and only when the setting is on.
            if (Properties.Settings.Default.stemmer)
            {
                List <string> stemmedTerms = new List <string>(queryTerms.Count);
                foreach (string term in queryTerms)
                {
                    stemmedTerms.Add(stemmer.stemTerm(term));
                }
                queryTerms = stemmedTerms;
            }

            Dictionary <string, Dictionary <string, int> > termOccurrences =
                searcher.AllQueryOccurrences(queryTerms.ToArray(), language);
            Dictionary <string, Dictionary <string, int> > semanticOccurrences =
                searcher.AllQueryOccurrences(semanticTerms.ToArray(), language);

            ConcurrentDictionary <string, double> ranking =
                ranker.CalculateTotalRank(queryTerms.ToArray(), semanticTerms, termOccurrences, semanticOccurrences);

            QueriesResults[queryId] = ranker.sortRanking(ranking);
        }
Exemplo n.º 18
0
        /// <summary>
        /// Finds the 1-based word positions at which a word occurs in a document under
        /// "Web_Documents/". Both the searched word and every document word are lower-cased and
        /// stemmed before comparison; positions count words across all lines of the file.
        /// </summary>
        /// <param name="word">The word to look for.</param>
        /// <param name="docName">File name of the document inside "Web_Documents/".</param>
        /// <returns>The positions of all matching words, in reading order.</returns>
        static List <int> positions(string word, string docName)
        {
            string  target       = word.ToLower();
            Stemmer queryStemmer = new Stemmer();
            queryStemmer.add(target.ToCharArray(), target.Length);
            queryStemmer.stem();
            target = queryStemmer.ToString();

            List <int> hits       = new List <int>();
            char[]     separators = { '.', ',', ';', ':', '-', '!', '?', '"', '\'', '`', '(', ')', '[', ']', '{', '}', ' ', '\t' };
            int        ordinal    = 0; // running word counter over the whole file

            using (StreamReader reader = new StreamReader("Web_Documents/" + docName))
            {
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    foreach (string raw in line.Split(separators, StringSplitOptions.RemoveEmptyEntries))
                    {
                        string  lowered      = raw.ToLower();
                        Stemmer tokenStemmer = new Stemmer();
                        tokenStemmer.add(lowered.ToCharArray(), lowered.Length);
                        tokenStemmer.stem();
                        string stem = tokenStemmer.ToString();

                        ordinal++;
                        if (stem.Equals(target))
                        {
                            hits.Add(ordinal);
                        }
                    }
                }
            }

            return(hits);
        }
    /// <summary>
    /// Splits a paragraph into sentences and words, stems the eligible words, and records each
    /// distinct new stem together with how often it occurred, in <c>PositiveTimes</c> /
    /// <c>NegativeTimes</c>.
    /// A word is eligible when it is not a contraction or stop word and has 4 to 12 characters.
    /// Stems already known to <c>IsStemmedWordPartOfList</c> are skipped.
    /// </summary>
    /// <param name="text">The paragraph; null, empty or whitespace-only text yields an empty list.</param>
    /// <param name="isPositive">Whether occurrences are counted as positive (true) or negative (false).</param>
    /// <returns>The distinct new stems found in this paragraph, in order of first appearance.</returns>
    private List <string> SplitsTheParagraphInWords(string text, bool isPositive)
    {
        List <string> WordsList = new List <string>();

        // Nothing to split. The original indexed text[text.Length - 1] here and threw on empty input.
        if (string.IsNullOrWhiteSpace(text))
        {
            return(WordsList);
        }

        Stemmer stemmer = new Stemmer();

        text = text.Trim().ToLower();

        // Every punctuation mark except '.' becomes a space; '.' is kept as the sentence separator.
        char[] punctuation =
        {
            ',', ';', ':', '\"', '\'', '!', '?', '(', ')', ']', '[', '<', '>',
            '+', '*', '%', '&', '$', '=', '^', '-', '/', '\\', '@', '_'
        };
        foreach (char mark in punctuation)
        {
            text = text.Replace(mark, ' ');
        }

        foreach (string sentence in text.Split('.'))
        {
            if (string.IsNullOrEmpty(sentence))
            {
                continue;
            }

            foreach (string candidate in sentence.Trim().Split(' '))
            {
                if (IsWordContractionOrStop(candidate) || string.IsNullOrEmpty(candidate) ||
                    candidate.Length <= 3 || candidate.Length >= 13)
                {
                    continue;
                }

                string stem = StemTheWord(candidate, stemmer);

                if (IsStemmedWordPartOfList(stem, isPositive))
                {
                    continue;
                }

                int indexOfWord = WordsList.IndexOf(stem);

                if (indexOfWord < 0)
                {
                    // First occurrence in this paragraph; very short stems are dropped.
                    if (stem.Length > 2)
                    {
                        WordsList.Add(stem);
                        PositiveTimes.Add(isPositive ? 1 : 0);
                        NegativeTimes.Add(isPositive ? 0 : 1);
                    }
                    continue;
                }

                try
                {
                    // This paragraph's counters are the last WordsList.Count entries of the
                    // shared lists, hence the offset from the end.
                    if (isPositive)
                    {
                        PositiveTimes[PositiveTimes.Count - WordsList.Count + indexOfWord]++;
                    }
                    else
                    {
                        NegativeTimes[NegativeTimes.Count - WordsList.Count + indexOfWord]++;
                    }
                }
                catch
                {
                    Debug.Log("Error Index \"SplitsTheParagraphInWords\" Method");
                }
            }
        }

        return(WordsList);
    }
Exemplo n.º 20
0
        /// <summary>
        /// Indexes every file under <paramref name="folder"/> (recursively) using one task per
        /// processor. Each task builds its own index over a contiguous slice of the file list;
        /// the per-task indices are then merged into the first one.
        /// </summary>
        /// <param name="options">Options passed to every per-task <c>WriteableIndex</c>.</param>
        /// <param name="folder">Root folder whose files are indexed.</param>
        /// <returns>The merged index, the number of files and documents processed, and the total byte size.</returns>
        public unsafe (WriteableIndex index, int files, int docs, long size) IndexAllParallel(IndexOptions options, string folder)
        {
            var timer = Stopwatch.StartNew();

            var files = Directory.GetFiles(folder, "*", SearchOption.AllDirectories);

            Console.WriteLine("Found files: " + files.Length + " - took: " + timer.ElapsedMilliseconds + "ms");

            timer.Restart();

            var tasks    = new List <Task <WriteableIndex> >();
            var parallel = Environment.ProcessorCount;

            var docsCount = 0;
            var fileCount = 0;
            var sizeSum   = 0L;
            var sizeLocal = 0L; // bytes processed since the last progress line

            for (var p = 0; p < parallel; p++)
            {
                var taskNumber = p;
                tasks.Add(Task.Run(() =>
                {
                    var localIndex   = new WriteableIndex(options);
                    var localStemmer = new Stemmer();
                    var localParser  = new Parser();

                    // Contiguous slice per task; the last task also takes the remainder.
                    var localPart = (files.Length / parallel);
                    var from      = taskNumber * localPart;
                    var to        = taskNumber == parallel - 1 ? files.Length : from + localPart;

                    for (var i = from; i < to; i++)
                    {
                        using (var mmf = MemoryMappedFile.CreateFromFile(files[i], FileMode.Open))
                        using (var accessor = mmf.CreateViewAccessor())
                        {
                            var handle   = accessor.SafeMemoryMappedViewHandle;
                            byte *buffer = null;
                            handle.AcquirePointer(ref buffer);

                            try
                            {
                                var len = new FileInfo(files[i]).Length;

                                var docs = localParser.ParseFileFast(buffer, len);

                                Interlocked.Add(ref docsCount, docs.Count);
                                Interlocked.Add(ref sizeSum, len);
                                Interlocked.Add(ref sizeLocal, len);

                                IndexDocuments(localIndex, localStemmer, buffer, docs);
                            }
                            finally
                            {
                                // Always pair AcquirePointer with ReleasePointer, even when
                                // parsing or indexing throws.
                                handle.ReleasePointer();
                            }
                        }

                        // Print the value returned by the increment instead of re-reading the
                        // shared counter without synchronization.
                        var finished = Interlocked.Increment(ref fileCount);
                        if (finished % 100 == 0)
                        {
                            // NOTE(review): the stopwatch is shared by all tasks and restarted
                            // here without synchronization — the reported rate is approximate.
                            Console.WriteLine(
                                "Finished: " + finished + " - "
                                + Math.Round((Interlocked.Read(ref sizeLocal) * 0.000001d)) + " mb - "
                                + Math.Round((Interlocked.Read(ref sizeLocal) * 0.000001d) / (timer.ElapsedMilliseconds / 1000d), 2) + " mb/s");
                            timer.Restart();

                            Interlocked.Exchange(ref sizeLocal, 0);
                        }
                    }

                    Console.WriteLine("task finished: " + taskNumber);

                    return(localIndex);
                }
                                   ));
            }

            var all = Task.WhenAll(tasks).Result;

            Console.WriteLine("Index building completed. Merging indices ...");

            var master = all[0];

            var mergeTime = Stopwatch.StartNew();

            for (var i = 1; i < all.Length; i++)
            {
                master.Merge(all[i]);
            }

            mergeTime.Stop();
            Console.WriteLine("Merge complete after: " + mergeTime.ElapsedMilliseconds + " ms");

            timer.Stop();

            return(master, fileCount, docsCount, sizeSum);
        }
Exemplo n.º 21
0
        /// <summary>
        /// Main method of the pre-query engine: processes every document of every corpus file and builds the index.
        /// </summary>
        /// <remarks>
        /// For each corpus file (paths ending in "stop_words.txt" are skipped) the file is separated into
        /// documents; each document is lower-cased, split into tokens, parsed term by term, optionally
        /// stemmed, and its per-document term dictionary is handed to the indexer. After all files are
        /// processed the indexer's merge/update/save methods are invoked, the distinct document languages
        /// are written to disk, and a message box reports the term count, document count and run time.
        /// </remarks>
        public void engine()
        {
            stopWatch.Start();
            string[] files = rf.getCorpusFilesFromSource();
            // Collects the original language of every document seen; de-duplicated at the end.
            ConcurrentBag <string> languagesConcurrentBag = new ConcurrentBag <string>();

            //Thread t = new Thread(() => indexer.mergeQueueFirstThread());
            //t.Start();
            foreach (string filePath in files)
            {
                // The stop-word list is not indexed as a corpus file.
                if (filePath.EndsWith("stop_words.txt"))
                {
                    continue;
                }
                string[] docs = rf.seperateDocumentsFromFile(filePath);
                // Per-file dictionaries: one fed by AddAutoCompletion, one fed by addUniqueDicToTempDic.
                ConcurrentDictionary <string, Dictionary <string, int> > ContinuTermsFileDic = new ConcurrentDictionary <string, Dictionary <string, int> >();
                ConcurrentDictionary <string, string> tempFileDictionary = new ConcurrentDictionary <string, string>();
                // MaxDegreeOfParallelism = 1 limits the loop to one document at a time.
                Parallel.ForEach(docs, new ParallelOptions {
                    MaxDegreeOfParallelism = 1
                }, (doc) =>
                {
                    Stemmer stemmer = new Stemmer();
                    Dictionary <string, int> uniqeTermsAtDoc = new Dictionary <string, int>();
                    string metaData;
                    string text;
                    rf.getMetaDataAndTextFromDoc(doc, out metaData, out text);
                    string docNo = indexer.AddDocFromMetaData(metaData);
                    //if (docNo.Equals("FBIS4-11824"))
                    //{
                    //    string s = filePath.ToString();
                    //}
                    languagesConcurrentBag.Add(indexer.documentDictionary[docNo].originalLanguage);
                    // Tokenize: lower-case the text and split on whitespace and the listed punctuation sequences.
                    string[] stringSeparators = new string[] { " ", "\n", "...", "--", "?", ")", "(", "[", "]", "\"", "&", "_", ";", "~", "|" };
                    string[] textArray        = text.ToLower().Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
                    for (int i = 0; i < textArray.Length; i++)
                    {
                        textArray[i] = parser.cutAllsigns(textArray[i]);
                    }
                    // A document without tokens contributes nothing further (note: it was already added to the
                    // indexer and to the language bag above).
                    if (textArray.Length == 0)
                    {
                        return;
                    }
                    // Four empty entries are appended and the loop below stops four short of the end —
                    // presumably so parseTerm can look ahead from index i without running off the list;
                    // TODO confirm against parseTerm.
                    List <string> textList = textArray.ToList();
                    textList.Add("");
                    textList.Add("");
                    textList.Add("");
                    textList.Add("");
                    // Previously accepted term, passed together with the current one to AddAutoCompletion.
                    string lastParsTerm = "";

                    for (int i = 0; i < textList.Count - 4; i++)
                    {
                        string parsedTerm1;
                        string parsedTerm2;
                        // Tokens for which checkForStopWord returns 1 are skipped, except "between",
                        // which is still passed to parseTerm.
                        if (parser.checkForStopWord(textList[i]) == 1 && !textList[i].Equals("between"))
                        {
                            continue;
                        }
                        else
                        {
                            int jump = parser.parseTerm(ref textList, i, out parsedTerm1, out parsedTerm2);
                            if (jump >= 0)
                            {
                                // A non-negative result is added to i, advancing the loop by that many extra tokens.
                                i += jump;
                                //stemmer
                                if (Properties.Settings.Default.stemmer)
                                {
                                    parsedTerm1 = stemmer.stemTerm(parsedTerm1);
                                }
                                AddTermUniqe(parsedTerm1, uniqeTermsAtDoc);
                                if (i > 0)
                                {
                                    AddAutoCompletion(lastParsTerm, parsedTerm1, ContinuTermsFileDic);
                                }
                                lastParsTerm = parsedTerm1;


                                // A second term, when produced, is recorded but does not become lastParsTerm.
                                if (parsedTerm2 != null)
                                {
                                    if (Properties.Settings.Default.stemmer)
                                    {
                                        parsedTerm2 = stemmer.stemTerm(parsedTerm2);
                                    }
                                    AddTermUniqe(parsedTerm2, uniqeTermsAtDoc);
                                    // lastParsTerm = parsedTerm2;
                                }
                            }
                            else
                            {
                                // Negative result: the term is recorded only if it is non-null, non-empty
                                // and the current token is not "between".
                                if (parsedTerm1 != null && !textList[i].Equals("between") && !parsedTerm1.Equals(""))
                                {
                                    if (Properties.Settings.Default.stemmer)
                                    {
                                        parsedTerm1 = stemmer.stemTerm(parsedTerm1);
                                    }
                                    AddTermUniqe(parsedTerm1, uniqeTermsAtDoc);
                                    if (i > 0)
                                    {
                                        AddAutoCompletion(lastParsTerm, parsedTerm1, ContinuTermsFileDic);
                                    }
                                    lastParsTerm = parsedTerm1;
                                }
                            }
                        }
                    }
                    // Hand the finished per-document term dictionary to the indexer and the weighting step.
                    indexer.AddToMetaData(uniqeTermsAtDoc, docNo);

                    CalWij(uniqeTermsAtDoc, docNo);

                    indexer.addUniqueDicToTempDic(ref tempFileDictionary, uniqeTermsAtDoc, docNo);

                    indexer.addUniqueDicToMainDic(uniqeTermsAtDoc);
                });

                // Once per corpus file: pass the file-level dictionaries on.
                indexer.addFileDicToDisk(tempFileDictionary);
                AddCompletionDicToMain(ContinuTermsFileDic);
            }
            indexer.stop = false;
            //t.Join();
            indexer.mergeQueue();
            indexer.updateTermPointers();
            indexer.saveTermDictionary();
            indexer.saveDocumentDictionary();
            stopWatch.Stop();
            LanguagesList = new List <string>(languagesConcurrentBag.Distinct());
            // NOTE(review): LanguagesList is assigned above but lowercase languagesList is passed here —
            // presumably the property's backing field; confirm.
            WriteLanguagesToDisk(languagesList);
            // NOTE(review): the result is never read in this method.
            int sum = indexer.countNumbers();

            // Run time is reported in whole seconds (integer division of the elapsed milliseconds).
            System.Windows.MessageBox.Show("Inverted index is complete. \nNumber of terms: " + indexer.mainTermDictionary.Count() + ".\nNumber of documents: " + indexer.documentDictionary.Count() + "\nRun time: " + stopWatch.ElapsedMilliseconds / 1000);
        }
Exemplo n.º 22
0
        /// <summary>
        /// Returns up to <paramref name="N"/> capitalised, uncommon words from the comment tree together
        /// with the comments that contain them.
        /// </summary>
        /// <remarks>
        /// Words are ranked by the number of tree nodes at which the word occurs in more than one child
        /// branch (or in the only branch of a single-branch node); the variable names suggest these are
        /// lowest-common-ancestor candidates. Words with a common-word frequency of 1 or more, words not
        /// starting with an upper-case letter, single-character words and contractions are filtered out.
        /// As a side effect, <c>NodeList</c> is rebuilt.
        /// </remarks>
        /// <param name="N">Maximum number of words to return.</param>
        /// <returns>
        /// A dictionary from word to the comments containing it, built in descending order of comment count.
        /// </returns>
        public Dictionary<string, List<CommentObj>> GetNamedObjects(int N)
        {
            StringBuilder sbAllWords = new StringBuilder();

            foreach (children child in children)
            {
                sbAllWords.Append(child.SubtreeText);
                sbAllWords.Append(" ");
            }
            string[] allWords = GetAllWords(sbAllWords.ToString());
            Dictionary<string, string> stemParentDictionary = GetStemParentDictionary(allWords);

            // Build a synthetic root whose branches are the id sets of each top-level comment's subtree.
            children rootNode = new children();
            List<HashSet<int>> rootChildIDs = new List<HashSet<int>>();

            foreach (children child in children)
            {
                GetChildIDHashSetList(child);
                HashSet<int> currChildIDs = new HashSet<int>();
                currChildIDs.Add(child.id);
                foreach (var item in child.ChildIDList)
                {
                    currChildIDs.UnionWith(item);
                }
                rootChildIDs.Add(currChildIDs);
            }
            rootNode.ChildIDList = rootChildIDs;
            NodeList = new List<children>();
            NodeList.Add(rootNode);
            foreach (children child in children)
            {
                PopulateNodeList(child);
            }

            Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping();
            Dictionary<string, List<children>> WordLCAList = new Dictionary<string, List<children>>();
            // Remembers the id set each display word came from, so the lookup further down does not
            // depend on re-stemming the display word reproducing the original key.
            Dictionary<string, HashSet<int>> idsByDisplayWord = new Dictionary<string, HashSet<int>>();

            foreach (var kvp in wordIDMapping)
            {
                List<children> currLCAList = new List<children>();
                foreach (children node in NodeList)
                {
                    int numBranchesWithWord = 0;
                    foreach (var childIDBranch in node.ChildIDList)
                    {
                        // Any() stops at the first shared id instead of counting the whole intersection.
                        if (childIDBranch.Intersect(kvp.Value).Any())
                        {
                            numBranchesWithWord += 1;
                        }
                    }
                    if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
                    {
                        currLCAList.Add(node);
                    }
                }

                // Prefer the stem's recorded parent word as the display form; fall back to the key itself.
                string displayWord;
                if (!stemParentDictionary.TryGetValue(kvp.Key, out displayWord))
                {
                    displayWord = kvp.Key;
                }
                WordLCAList[displayWord] = currLCAList;
                idsByDisplayWord[displayWord] = kvp.Value;
            }

            List<string> namedObjects = WordLCAList
                                        .OrderByDescending(x => x.Value.Count)
                                        .Select(x => x.Key)
                                        .Where(y => CommonWords.GetFrequency(y) < 1)
                                        // Length is checked before indexing so an empty key cannot throw on a[0].
                                        .Where(b => b.Length > 1)
                                        .Where(a => char.IsUpper(a[0]))
                                        .Where(z => !(z.EndsWith("n't") || z.EndsWith("'m") || (z.EndsWith("'ll")) || (z.EndsWith("'d")) || z.EndsWith("'ve") || z.EndsWith("'re") || z.EndsWith("'s")))
                                        .Take(N)
                                        .ToList();
            //namedObjects.Sort();
            Dictionary<string, List<CommentObj>> namedObjectDictionary = new Dictionary<string, List<CommentObj>>();

            foreach (string namedObject in namedObjects)
            {
                List<CommentObj> commentObjsForWord = new List<CommentObj>();
                string stem = Stemmer.GetStem(namedObject);
                HashSet<int> idsWithWord;
                if (!wordIDMapping.TryGetValue(stem, out idsWithWord))
                {
                    // Re-stemming did not reproduce a known key (the original code threw
                    // KeyNotFoundException here); use the ids recorded for this display word.
                    idsWithWord = idsByDisplayWord[namedObject];
                }
                foreach (int id in idsWithWord)
                {
                    children child = GetNodeById(id);
                    CommentObj commentObj = new CommentObj()
                    {
                        Id = id, Text = child.text
                    };
                    commentObjsForWord.Add(commentObj);
                }
                namedObjectDictionary[namedObject] = commentObjsForWord;
            }

            // NOTE(review): Dictionary<TKey, TValue> does not contractually preserve insertion order;
            // callers relying on the ordering here depend on an implementation detail.
            var ordered = namedObjectDictionary
                          .OrderByDescending(pair => pair.Value.Count)
                          .ToDictionary(pair => pair.Key, pair => pair.Value);

            return(ordered);
        }
Exemplo n.º 23
0
        /*
         * Sentence-tokenizes every top-level comment and keeps, per comment, the
         * sentence with the best score. A sentence scores higher when its non-stop
         * words occur in more items of the comment's subtree; each word's contribution
         * is divided by (its common-word frequency + 1), and the total is damped for
         * short sentences and averaged over the sentence length.
         *
         * Returns at most N sentence objects (blank sentences are excluded before the
         * top N are taken), ordered by descending child count of the owning comment.
         * A comment that fails to process is skipped with a trace warning.
         */
        public List<SentenceObj> GetTopSentences(int N)
        {
            List<SentenceObj>          topSentenceObjs      = new List<SentenceObj>();
            Dictionary<string, double> sentenceScores       = new Dictionary<string, double>();
            Dictionary<string, string> sentenceAuthors      = new Dictionary<string, string>();
            Dictionary<string, string> sentenceCommentTrees = new Dictionary<string, string>();
            Dictionary<string, int>    sentenceIds          = new Dictionary<string, int>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
                    List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(child.text));
                    if (currSentences == null || currSentences.Count == 0)
                    {
                        // Nothing to score; previously this case was hidden by the swallowed exception.
                        continue;
                    }

                    // Falls back to the first sentence when no sentence qualifies as "good".
                    string bestSentence = currSentences[0];
                    double currMax      = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords = GetAllWords(sentence);
                        // A good sentence has more than two words, more than two of which are not stop
                        // words. (The original counted stop words absent from the sentence, which is
                        // effectively always true.)
                        bool goodSentence = (allWords.Length > 2) && (allWords.Count(w => !stopWords.Contains(w.ToLower())) > 2);
                        if (!goodSentence)
                        {
                            continue;
                        }

                        double weightedScore = 0;
                        foreach (string word in allWords)
                        {
                            if (stopWords.Contains(word.ToLower()))
                            {
                                continue;
                            }
                            HashSet<int> idsContainingWord;
                            if (wordIDMapping.TryGetValue(Stemmer.GetStem(word), out idsContainingWord))
                            {
                                weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                            }
                        }
                        //add some weighting so that longer sentences have more weight
                        weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                        double avgScore = weightedScore / allWords.Length;
                        if (avgScore > currMax)
                        {
                            currMax      = avgScore;
                            bestSentence = sentence;
                        }
                    }
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                }
                catch (Exception ex)
                {
                    // Best effort: one bad comment must not abort the summary, but leave a trace of it.
                    System.Diagnostics.Trace.TraceWarning("GetTopSentences: skipping a comment: " + ex);
                }
            }

            // Filter blank sentences BEFORE taking the top N so blanks do not use up result slots.
            List<string> topSentences = sentenceScores
                                        .Where(y => !string.IsNullOrWhiteSpace(y.Key))
                                        .OrderByDescending(x => x.Value)
                                        .Take(N)
                                        .Select(x => x.Key)
                                        .ToList();
            foreach (var sent in topSentences)
            {
                SentenceObj sentenceObj = new SentenceObj()
                {
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
                };
                topSentenceObjs.Add(sentenceObj);
            }
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
            return(topSentenceObjs);
        }
Exemplo n.º 24
0
 /// <summary>
 /// Initializes an analyzer that splits words into groups.
 /// </summary>
 /// <param name="listOfWords">The list of words to analyze.</param>
 /// <param name="stemmerToUse">The stemmer to use (depends on the language).</param>
 /// <param name="detectAcuracy">The accuracy of the match between the extracted stems.</param>
 public WordAnalizer(List<string> listOfWords, Stemmer.iStemmer stemmerToUse, double detectAcuracy)
 {
     // Store the arguments on the instance unchanged.
     this.InputList = listOfWords;
     this.Stemmer   = stemmerToUse;
     this.Accuracy  = detectAcuracy;
 }
        /// <summary>
        /// Parses a saved HTML review page and writes one XML file per review containing the stemmed
        /// advantages, disadvantages and comments texts plus the numeric grade.
        /// </summary>
        /// <remarks>
        /// Grades are read from <c>span.grade-label</c> elements and texts from <c>div.data</c> elements.
        /// Nothing is written when the page has no grade labels or no data blocks. Output files are saved
        /// as <c>c:\xml\xml_{file name}{index}.xml</c>.
        /// </remarks>
        /// <param name="fileName">Path of the HTML file to read (loaded as UTF-8).</param>
        private static void GenerateXmlFromFile(string fileName)
        {
            Stemmer stemmer = new Stemmer();
            var htmlDoc = new HtmlDocument();
            htmlDoc.Load(fileName, Encoding.UTF8);
            var rootNode = htmlDoc.DocumentNode;

            var marksText = rootNode.SelectNodes("//span[@class='grade-label']");
            if (marksText == null)
            {
                return;
            }

            // Map the textual grade labels to numeric marks; unknown labels are ignored.
            List<int> grades = new List<int>();
            foreach (var mark in marksText)
            {
                switch (mark.InnerText)
                {
                    case "отличная модель":
                        grades.Add(5); break;
                    case "хорошая модель":
                        grades.Add(4); break;
                    case "обычная модель":
                        grades.Add(3); break;
                    case "плохая модель":
                        grades.Add(2); break;
                    case "ужасная модель":
                        grades.Add(1); break;
                    default: break;
                }
            }

            var texts = rootNode.SelectNodes("//div[@class='data']");
            if (texts == null)
            {
                // No review bodies on the page; the original code threw on the foreach below.
                return;
            }

            // Collect the review texts.
            // NOTE(review): disadvantages/comments are only added for some child-node counts, so these
            // lists can be shorter than `advantages` and index i may then refer to different reviews
            // across the lists — confirm against the real page structure.
            List<string> advantages = new List<string>();
            List<string> disadvantages = new List<string>();
            List<string> comments = new List<string>();
            foreach (var text in texts)
            {
                // When the third child is a div, the text nodes are shifted by one position.
                int offset = text.ChildNodes[2].Name == "div" ? 1 : 0;

                // Advantages
                advantages.Add(text.ChildNodes[2 + offset].InnerText);
                // Disadvantages
                if (text.ChildNodes.Count == 4 + offset)
                {
                    disadvantages.Add(text.ChildNodes[3 + offset].InnerText);
                }
                // Comment
                if (text.ChildNodes.Count == 5 + offset)
                {
                    comments.Add(text.ChildNodes[4 + offset].InnerText);
                }
            }

            char[] delimiterChars = { ' ', ',', '.', ':', '\t' };

            // Generate one XML document per review.
            for (int i = 0; i < advantages.Count; i++)
            {
                var xml = new XmlDocument();
                xml.AppendChild(xml.CreateNode(XmlNodeType.XmlDeclaration, "", ""));
                var review = xml.CreateElement("", "review", "");
                xml.AppendChild(review);

                AppendTextElement(xml, review, "advantages", StemWords(stemmer, advantages[i], delimiterChars));

                if (disadvantages.Count > i)
                {
                    // The stemmed words are written as-is; the original stemmed the joined string a
                    // second time here, unlike the advantages and comments.
                    AppendTextElement(xml, review, "disadvantages", StemWords(stemmer, disadvantages[i], delimiterChars));
                }

                if (comments.Count > i)
                {
                    AppendTextElement(xml, review, "comments", StemWords(stemmer, comments[i], delimiterChars));
                }

                if (grades.Count > i)
                {
                    AppendTextElement(xml, review, "grade", grades[i].ToString());
                }

                string path = String.Concat("c:\\xml\\", "xml_", Path.GetFileName(fileName), i.ToString(), ".xml");
                xml.Save(path);
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> on the delimiters, stems every token (empty ones included) and
        /// returns the stems concatenated, each preceded by a single space.
        /// </summary>
        private static string StemWords(Stemmer stemmer, string text, char[] delimiters)
        {
            var stemmed = new StringBuilder();
            foreach (var word in text.Split(delimiters))
            {
                stemmed.Append(' ').Append(stemmer.Stem(word));
            }
            return stemmed.ToString();
        }

        /// <summary>
        /// Appends a child element named <paramref name="elementName"/> holding a single text node.
        /// </summary>
        private static void AppendTextElement(XmlDocument xml, XmlNode parent, string elementName, string text)
        {
            var element = xml.CreateElement("", elementName, "");
            element.AppendChild(xml.CreateTextNode(text));
            parent.AppendChild(element);
        }
Exemplo n.º 26
0
        /// <summary>
        /// Chooses a correction suggestion for a suspected erroneous token.
        /// </summary>
        /// <remarks>
        /// Candidates are gathered in stages: first with the bigram threshold derived from the token
        /// length; if fewer than 10 are found and the threshold can be lowered, with the threshold minus
        /// one; and if still fewer than 10 and <paramref name="withStemAndAffixCorr"/> is set, additionally
        /// from the token's stem and from affix correction. With no candidates the original value is
        /// returned; a single candidate is returned directly; a candidate equal to the token (ignoring
        /// case) wins outright. Otherwise the list is cut to the 10 best by edit distance then frequency
        /// and ranked either by n-gram search hits (<paramref name="withSearch"/>) or by edit distance
        /// then frequency.
        /// </remarks>
        /// <param name="sPNameForGetCandidates">Passed through to <c>GetCandidates</c>.</param>
        /// <param name="deHyphenateTokens">Tokens of the text, passed to the n-gram hit searches.</param>
        /// <param name="key">Key of the token being corrected; passed through to the candidate lookups.</param>
        /// <param name="value">The token text to correct.</param>
        /// <param name="log">Receives a textual trace of how the suggestion was chosen.</param>
        /// <param name="withStemAndAffixCorr">Enables the stem-based and affix-correction candidate stages.</param>
        /// <param name="withSearch">Enables ranking by trigram/bigram search hits.</param>
        /// <returns>The suggested replacement, or <paramref name="value"/> when no candidate exists.</returns>
        public string GetSuggestion(string sPNameForGetCandidates, Dictionary<int, string> deHyphenateTokens, int key, string value, out string log, bool withStemAndAffixCorr, bool withSearch)
        {
            log = "";
            string error = value;

            int       minSameBigramAmount = getMinSameBigramAmount(error.Length);
            const int minLengthVariant    = 1; // limit Addition/Deletion errors to 1 character.
            const int maxLevensthein      = 2;
            const int minCandidates       = 10;
            const int maxCandidates       = 10;
            string    candidatesLog;

            List<CorrectionCandidate> candidates = GetCandidates(sPNameForGetCandidates, key, error, minSameBigramAmount, minLengthVariant, maxLevensthein, out candidatesLog);

            // Still too few: retry with a lower bigram threshold, when it can be lowered at all.
            if (candidates.Count < minCandidates && minSameBigramAmount - 1 > 0)
            {
                candidates = GetCandidates(sPNameForGetCandidates, key, error, minSameBigramAmount - 1, minLengthVariant, maxLevensthein, out candidatesLog);
                if (candidates.Count < minCandidates && withStemAndAffixCorr)
                {
                    // Still too few: look for candidates based on the stem of the token.
                    Stemmer stemmer = new Stemmer();
                    string  prefix;
                    string  suffix;
                    string  errorRoot = stemmer.StemmingWithoutChecking(error, out prefix, out suffix);
                    if (error != errorRoot && errorRoot.Length >= 3)
                    {
                        minSameBigramAmount = getMinSameBigramAmount(errorRoot.Length);
                        if (minSameBigramAmount > 0)
                        {
                            var fromStem = GetCandidates(sPNameForGetCandidates, key, error, errorRoot, prefix, suffix, minSameBigramAmount, minLengthVariant, maxLevensthein, out candidatesLog);
                            // ToList() materialises the filter before AddRange mutates `candidates`.
                            candidates.AddRange(fromStem.Where(x => !candidates.Any(y => y.Candidate == x.Candidate)).ToList());
                        }
                    }
                    // Also try correcting the affixes.
                    string candidatesFromAffixCorrLog;
                    var    fromAffix = GetCandidatesFromAffixCorrection(key, error, out candidatesFromAffixCorrLog);
                    candidates.AddRange(fromAffix.Where(x => !candidates.Any(y => y.Candidate == x.Candidate)).ToList());
                    candidatesLog += candidatesFromAffixCorrLog;
                }
            }

            if (candidates.Count == 0)
            {
                log = "No candidates";
                return(value);    // Assume it is a correct word that the dictionary does not cover.
            }
            if (candidates.Count == 1)
            {
                log += candidatesLog;
                return(candidates[0].Candidate);
            }

            // A candidate identical to the token (ignoring case) becomes the suggestion.
            var candidateEqError = candidates.FirstOrDefault(o => o.Candidate.Equals(error, StringComparison.OrdinalIgnoreCase));
            if (candidateEqError != null)
            {
                log = "candidate=error";
                return(candidateEqError.Candidate);
            }

            if (candidates.Count > maxCandidates)
            {
                // Limit the number of candidates: smallest Levenshtein distance first, then highest frequency.
                candidates = candidates.OrderBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).Take(maxCandidates).ToList();
            }

            CorrectionCandidate best;
            if (withSearch)
            {
                string logsearch;

                // Search every candidate using trigrams.
                foreach (CorrectionCandidate candidate in candidates)
                {
                    candidate.NGram = 3;
                    candidate.Hits  = GetTrigramSearchHits(deHyphenateTokens, candidate, false, out logsearch);
                    log            += string.Format("[{0}:{1},{2}]", candidate.Candidate, logsearch, candidate.Hits);
                }

                // If no candidate got a trigram hit, fall back to a bigram search for every candidate.
                if (!candidates.Any(p => p.Hits > 0))
                {
                    foreach (CorrectionCandidate candidate in candidates)
                    {
                        candidate.NGram = 2;
                        candidate.Hits  = GetBigramSearchHits(deHyphenateTokens, candidate, false, out logsearch);
                        log            += string.Format("[{0}:{1},{2}]", candidate.Candidate, logsearch, candidate.Hits);
                    }
                }

                // Same ranking for both search paths: most hits, then smallest distance, then frequency.
                best = candidates.OrderByDescending(o => o.Hits).ThenBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).First();
            }
            else
            {
                best = candidates.OrderBy(o => o.Levensthein).ThenByDescending(o => o.Frequency).First();
            }

            // FINAL SUGGESTION:
            log = candidatesLog + log;
            return(best.Candidate);
        }
Exemplo n.º 27
0
        private void btCorrectIt_Click(object sender, EventArgs e)
        {
            if (cbMethod.SelectedItem.ToString() == "-- Choose method --")
            {
                MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
                return;
            }
            if (!File.Exists(txOCR.Text.Trim()))
            {
                MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);
                return;
            }

            ResetControls(false);

            articleFile = txOCR.Text.Trim();
            Correction correction = new Correction();
            Stemmer    stemmer    = new Stemmer();

            // DeHyphenate and clean text:
            string dehyphenatedText = correction.DeHyphenate(articleFile);

            rtbOCR.Text = dehyphenatedText;

            // for analysis:
            string dehyphenatedTextGT = "";

            if (File.Exists(articleFile.Substring(0, articleFile.Length - 4) + "GT.txt"))
            {
                articleFileGT = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
            }
            articleFileName = Path.GetFileName(articleFile);
            if (!string.IsNullOrEmpty(articleFileGT))
            {
                dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);
            }

            // tokenize:
            deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);

            Regex rgx = new Regex("[^a-zA-Z]"); //omit all non alphabet word And clean word from non alphabet:

            // for analysis:
            Dictionary <int, string> deHyphenateTokensGT = new Dictionary <int, string>();

            if (!string.IsNullOrEmpty(articleFileGT))
            {
                deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
                foreach (KeyValuePair <int, string> token in deHyphenateTokens)
                {
                    correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(deHyphenateTokensGT[token.Key], ""));
                }
            }

            // Omit non character,single char, All Capitals word, and clean word from non alphabet:
            var tmp = deHyphenateTokens.Where(p => p.Value.Length > 1).ToDictionary(p => p.Key, p => p.Value);

            tmp = tmp.Where(p => p.Value.Any(Char.IsLetter)).ToDictionary(p => p.Key, p => rgx.Replace(p.Value, ""));
            Dictionary <int, string> cleanTokens = tmp.Where(p => !p.Value.All(Char.IsUpper)).ToDictionary(p => p.Key, p => p.Value);



            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Hunspell"))
            {
                string hunspellLog = "";
                // find Suggestion using Hunspell:
                foreach (KeyValuePair <int, string> err in cleanTokens)
                {
                    string        errInNewSpell       = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();
                    List <string> hunspellSuggestions = new List <string>();
                    using (SpellEngine engine = new SpellEngine())
                    {
                        LanguageConfig idConfig = new LanguageConfig();
                        idConfig.LanguageCode     = "id";
                        idConfig.HunspellAffFile  = "id_ID.aff";
                        idConfig.HunspellDictFile = "id_ID.dic";
                        idConfig.HunspellKey      = "";
                        engine.AddLanguage(idConfig);
                        bool correct = engine["id"].Spell(errInNewSpell);
                        if (!correct)
                        {
                            hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
                            if (hunspellSuggestions.Count > 0 && err.Value != correction.ChangeNewToOldSpell(hunspellSuggestions[0]))
                            {
                                deHyphenateTokens[err.Key] = "[" + correction.ChangeNewToOldSpell(hunspellSuggestions[0]) + "]";
                            }
                            // for analysis:
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), rgx.Replace(deHyphenateTokens[err.Key], "") }, { getFieldNameFromOption().Replace("Correction", "Log"), hunspellLog }
                                });
                            }
                        }
                        else
                        {
                            // for analysis:
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), err.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), err.Value + " is correct" }
                                });
                            }
                        }
                    }
                }
                ResetControls(true);
                return;
            }


            //check only unique word (assumption:duplicate word is correct word) :
            Dictionary <int, string> checkTokens = cleanTokens;
            var duplicateValues = checkTokens.GroupBy(x => x.Value).Where(x => x.Count() > 1);

            List <int> duplicateKeys = new List <int>();

            foreach (var item in checkTokens)
            {
                foreach (var dup in duplicateValues)
                {
                    if (item.Value == dup.Key)
                    {
                        duplicateKeys.Add(item.Key);
                    }
                }
            }
            foreach (var dupkey in duplicateKeys)
            {
                // for analysis
                if (!string.IsNullOrEmpty(articleFileGT))
                {
                    correction.UpdateFields(articleFileName, dupkey, new Dictionary <string, string> {
                        { "NCorrection", checkTokens[dupkey] }, { "NLog", "Duplicate" }, { "Correction", checkTokens[dupkey] }, { "Log", "Duplicate" }, { "WOSearchCorrection", checkTokens[dupkey] }, { "WOSearchLog", "Duplicate" }, { "WOStemCorrection", checkTokens[dupkey] }, { "WOStemLog", "Duplicate" }, { "WOStemSearchCorrection", checkTokens[dupkey] }, { "WOStemSearchLog", "Duplicate" }, { "GooglePureCorrection", checkTokens[dupkey] }, { "GooglePureLog", "Duplicate" }
                    });
                }
                checkTokens.Remove(dupkey);
            }


            //Check Word using Dictionary(kbbi+kompas pilihan, entitas kota,negara, nama pahlawan dari wiki ):
            errors = new Dictionary <int, string>();
            foreach (KeyValuePair <int, string> token in checkTokens)
            {
                // change Soewandi to Modern Spelling:
                string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();

                // check word in Dictionary and Add to Error list if not there:
                int frequency;
                if (!correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
                {
                    if (cbMethod.SelectedItem.ToString().Contains("Stemmer"))
                    {
                        // check again its stem in dictionary :
                        string stem = stemmer.Stemming(wordToCheck);
                        if (wordToCheck != stem && stemmer.checkStem(stem))
                        {
                            // for analysis
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), stem + " is word" }
                                });
                            }
                        }
                        else // jika tidak ada di kamus:
                        {
                            errors.Add(token.Key, wordToCheck);
                        }
                    }
                    else
                    {
                        errors.Add(token.Key, wordToCheck);
                    }
                }
                else // jika ada di kamus:
                {
                    // for analysis
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), wordToCheck + " is correct" }
                        });
                    }
                }
            }


            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Google"))
            {
                timerGoogle.Enabled = true;
                indexTimerGoogle    = 0;
                return;
            }
            else
            {
                foreach (KeyValuePair <int, string> err in errors)
                {
                    //get suggestion:
                    string log; string suggestion;
                    suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());

                    // Change suggestion back to Old Spell if any suggestions:
                    if (log != "No candidates")
                    {
                        suggestion = correction.ChangeNewToOldSpell(suggestion);
                    }

                    // update token dic with suggestion:
                    if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
                    {
                        deHyphenateTokens[err.Key] = "[" + suggestion + "]";
                    }

                    // for analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), suggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), log }
                        });
                    }
                }
                ResetControls(true);
            }
        }
 /// <summary>
 /// Stems a single token and records the resulting stem as a feature on the given item.
 /// </summary>
 /// <param name="stemmer">Stemmer whose <c>Stem</c> method is applied to <paramref name="tokenKey"/>.</param>
 /// <param name="item">
 /// Feature map of the item being processed. It is updated through the
 /// <c>IncreaseFeatureFrequency</c> extension (defined elsewhere) using the key
 /// <c>"stem_" + stem</c> and an increment of 1.
 /// </param>
 /// <param name="tokenKey">The raw token to stem.</param>
 /// <returns>The stemmed form of <paramref name="tokenKey"/> as produced by the stemmer.</returns>
 public static string ExtractStemFeatureFromSingleTokenAndUpdateItemFeatures(Stemmer stemmer, Dictionary<string, double> item, string tokenKey)
 {
     // Prefix that distinguishes stem features from other feature keys in the map.
     const string StemFeaturePrefix = "stem_";

     string stemmedToken = stemmer.Stem(tokenKey);
     string featureName  = StemFeaturePrefix + stemmedToken;

     // NOTE(review): presumably adds 1 to the value stored under featureName — confirm against the extension's definition.
     item.IncreaseFeatureFrequency(featureName, 1);

     return stemmedToken;
 }
Exemplo n.º 29
0
 private unsafe void IndexDocuments(WriteableIndex index, Stemmer stemmer, byte *buffer, List <(string id, int from, int length)> docs)